aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/fs
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2016-04-18 11:18:55 +0200
committerJiri Kosina <jkosina@suse.cz>2016-04-18 11:18:55 +0200
commit9938b04472d5c59f8bd8152a548533a8599596a2 (patch)
tree0fc8318100878c5e446076613ec02a97aa179119 /fs
parentDoc: treewide : Fix typos in DocBook/filesystem.xml (diff)
parentLinux 4.6-rc4 (diff)
downloadwireguard-linux-9938b04472d5c59f8bd8152a548533a8599596a2.tar.xz
wireguard-linux-9938b04472d5c59f8bd8152a548533a8599596a2.zip
Merge branch 'master' into for-next
Sync with Linus' tree so that patches against newer codebase can be applied. Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Makefile5
-rw-r--r--fs/9p/acl.c83
-rw-r--r--fs/9p/cache.c8
-rw-r--r--fs/9p/cache.h1
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/v9fs.h2
-rw-r--r--fs/9p/vfs_addr.c18
-rw-r--r--fs/9p/vfs_file.c12
-rw-r--r--fs/9p/vfs_inode.c33
-rw-r--r--fs/9p/vfs_inode_dotl.c24
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/9p/xattr.c38
-rw-r--r--fs/9p/xattr.h3
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/9p/xattr_user.c80
-rw-r--r--fs/Kconfig20
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/adfs.h32
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/adfs/dir_f.c2
-rw-r--r--fs/adfs/dir_fplus.c2
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/amigaffs.c13
-rw-r--r--fs/affs/file.c39
-rw-r--r--fs/affs/inode.c1
-rw-r--r--fs/affs/namei.c1
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/affs/symlink.c9
-rw-r--r--fs/afs/dir.c2
-rw-r--r--fs/afs/file.c4
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/afs/proc.c25
-rw-r--r--fs/afs/super.c6
-rw-r--r--fs/afs/write.c30
-rw-r--r--fs/attr.c2
-rw-r--r--fs/autofs4/autofs_i.h72
-rw-r--r--fs/autofs4/dev-ioctl.c57
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/init.c10
-rw-r--r--fs/autofs4/inode.c52
-rw-r--r--fs/autofs4/root.c165
-rw-r--r--fs/autofs4/symlink.c25
-rw-r--r--fs/autofs4/waitq.c78
-rw-r--r--fs/bad_inode.c2
-rw-r--r--fs/befs/linuxvfs.c42
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c32
-rw-r--r--fs/binfmt_elf_fdpic.c70
-rw-r--r--fs/binfmt_misc.c12
-rw-r--r--fs/block_dev.c214
-rw-r--r--fs/btrfs/Makefile5
-rw-r--r--fs/btrfs/acl.c12
-rw-r--r--fs/btrfs/async-thread.c6
-rw-r--r--fs/btrfs/backref.c47
-rw-r--r--fs/btrfs/btrfs_inode.h4
-rw-r--r--fs/btrfs/check-integrity.c181
-rw-r--r--fs/btrfs/compression.c97
-rw-r--r--fs/btrfs/compression.h9
-rw-r--r--fs/btrfs/ctree.c57
-rw-r--r--fs/btrfs/ctree.h282
-rw-r--r--fs/btrfs/delayed-inode.c20
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c16
-rw-r--r--fs/btrfs/delayed-ref.h8
-rw-r--r--fs/btrfs/dev-replace.c138
-rw-r--r--fs/btrfs/dev-replace.h7
-rw-r--r--fs/btrfs/disk-io.c311
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/extent-tree.c446
-rw-r--r--fs/btrfs/extent-tree.h0
-rw-r--r--fs/btrfs/extent_io.c663
-rw-r--r--fs/btrfs/extent_io.h150
-rw-r--r--fs/btrfs/extent_map.c10
-rw-r--r--fs/btrfs/extent_map.h10
-rw-r--r--fs/btrfs/file-item.c111
-rw-r--r--fs/btrfs/file.c310
-rw-r--r--fs/btrfs/free-space-cache.c72
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/free-space-tree.c1605
-rw-r--r--fs/btrfs/free-space-tree.h72
-rw-r--r--fs/btrfs/inode-map.c30
-rw-r--r--fs/btrfs/inode-map.h1
-rw-r--r--fs/btrfs/inode.c915
-rw-r--r--fs/btrfs/ioctl.c534
-rw-r--r--fs/btrfs/locking.c2
-rw-r--r--fs/btrfs/lzo.c32
-rw-r--r--fs/btrfs/ordered-data.c6
-rw-r--r--fs/btrfs/print-tree.c23
-rw-r--r--fs/btrfs/props.c1
-rw-r--r--fs/btrfs/qgroup.c81
-rw-r--r--fs/btrfs/raid56.c128
-rw-r--r--fs/btrfs/reada.c296
-rw-r--r--fs/btrfs/relocation.c36
-rw-r--r--fs/btrfs/root-tree.c12
-rw-r--r--fs/btrfs/scrub.c321
-rw-r--r--fs/btrfs/send.c69
-rw-r--r--fs/btrfs/send.h4
-rw-r--r--fs/btrfs/struct-funcs.c4
-rw-r--r--fs/btrfs/super.c169
-rw-r--r--fs/btrfs/sysfs.c35
-rw-r--r--fs/btrfs/sysfs.h5
-rw-r--r--fs/btrfs/tests/btrfs-tests.c61
-rw-r--r--fs/btrfs/tests/btrfs-tests.h10
-rw-r--r--fs/btrfs/tests/extent-io-tests.c189
-rw-r--r--fs/btrfs/tests/free-space-tests.c245
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c572
-rw-r--r--fs/btrfs/tests/inode-tests.c11
-rw-r--r--fs/btrfs/tests/qgroup-tests.c20
-rw-r--r--fs/btrfs/transaction.c96
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c27
-rw-r--r--fs/btrfs/tree-log.c253
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c300
-rw-r--r--fs/btrfs/volumes.h5
-rw-r--r--fs/btrfs/xattr.c241
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zlib.c38
-rw-r--r--fs/buffer.c171
-rw-r--r--fs/cachefiles/daemon.c25
-rw-r--r--fs/cachefiles/interface.c15
-rw-r--r--fs/cachefiles/internal.h6
-rw-r--r--fs/cachefiles/namei.c70
-rw-r--r--fs/cachefiles/rdwr.c113
-rw-r--r--fs/ceph/acl.c16
-rw-r--r--fs/ceph/addr.c451
-rw-r--r--fs/ceph/cache.c14
-rw-r--r--fs/ceph/caps.c120
-rw-r--r--fs/ceph/dir.c77
-rw-r--r--fs/ceph/export.c17
-rw-r--r--fs/ceph/file.c635
-rw-r--r--fs/ceph/inode.c70
-rw-r--r--fs/ceph/mds_client.c82
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/snap.c16
-rw-r--r--fs/ceph/super.c59
-rw-r--r--fs/ceph/super.h25
-rw-r--r--fs/ceph/xattr.c78
-rw-r--r--fs/cifs/cifs_debug.c58
-rw-r--r--fs/cifs/cifs_debug.h11
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifsencrypt.c34
-rw-r--r--fs/cifs/cifsfs.c108
-rw-r--r--fs/cifs/cifsfs.h18
-rw-r--r--fs/cifs/cifsglob.h32
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c37
-rw-r--r--fs/cifs/connect.c119
-rw-r--r--fs/cifs/file.c118
-rw-r--r--fs/cifs/inode.c40
-rw-r--r--fs/cifs/ioctl.c123
-rw-r--r--fs/cifs/link.c10
-rw-r--r--fs/cifs/misc.c2
-rw-r--r--fs/cifs/readdir.c1
-rw-r--r--fs/cifs/smb2file.c19
-rw-r--r--fs/cifs/smb2misc.c36
-rw-r--r--fs/cifs/smb2ops.c19
-rw-r--r--fs/cifs/smb2pdu.c157
-rw-r--r--fs/cifs/smb2pdu.h53
-rw-r--r--fs/cifs/smb2proto.h3
-rw-r--r--fs/cifs/smb2transport.c102
-rw-r--r--fs/cifs/smbencrypt.c26
-rw-r--r--fs/cifs/smbfsctl.h2
-rw-r--r--fs/cifs/transport.c6
-rw-r--r--fs/cifs/xattr.c16
-rw-r--r--fs/coda/cnode.c5
-rw-r--r--fs/coda/coda_linux.h3
-rw-r--r--fs/coda/dir.c4
-rw-r--r--fs/coda/file.c8
-rw-r--r--fs/coda/inode.c6
-rw-r--r--fs/coda/symlink.c4
-rw-r--r--fs/compat.c21
-rw-r--r--fs/compat_ioctl.c296
-rw-r--r--fs/configfs/configfs_internal.h14
-rw-r--r--fs/configfs/dir.c377
-rw-r--r--fs/configfs/file.c265
-rw-r--r--fs/configfs/inode.c26
-rw-r--r--fs/configfs/item.c1
-rw-r--r--fs/configfs/mount.c4
-rw-r--r--fs/configfs/symlink.c22
-rw-r--r--fs/coredump.c93
-rw-r--r--fs/cramfs/README26
-rw-r--r--fs/cramfs/inode.c33
-rw-r--r--fs/crypto/Kconfig18
-rw-r--r--fs/crypto/Makefile3
-rw-r--r--fs/crypto/crypto.c568
-rw-r--r--fs/crypto/fname.c (renamed from fs/f2fs/crypto_fname.c)276
-rw-r--r--fs/crypto/keyinfo.c272
-rw-r--r--fs/crypto/policy.c229
-rw-r--r--fs/dax.c649
-rw-r--r--fs/dcache.c227
-rw-r--r--fs/debugfs/inode.c30
-rw-r--r--fs/devpts/inode.c32
-rw-r--r--fs/direct-io.c71
-rw-r--r--fs/dlm/config.c329
-rw-r--r--fs/dlm/lowcomms.c86
-rw-r--r--fs/dlm/user.c11
-rw-r--r--fs/ecryptfs/crypto.c156
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h13
-rw-r--r--fs/ecryptfs/inode.c65
-rw-r--r--fs/ecryptfs/keystore.c226
-rw-r--r--fs/ecryptfs/main.c15
-rw-r--r--fs/ecryptfs/mmap.c49
-rw-r--r--fs/ecryptfs/read_write.c14
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/efivarfs/file.c74
-rw-r--r--fs/efivarfs/inode.c30
-rw-r--r--fs/efivarfs/internal.h3
-rw-r--r--fs/efivarfs/super.c24
-rw-r--r--fs/efs/inode.c1
-rw-r--r--fs/efs/super.c6
-rw-r--r--fs/efs/symlink.c4
-rw-r--r--fs/eventfd.c42
-rw-r--r--fs/eventpoll.c56
-rw-r--r--fs/exec.c108
-rw-r--r--fs/exofs/dir.c30
-rw-r--r--fs/exofs/file.c4
-rw-r--r--fs/exofs/inode.c40
-rw-r--r--fs/exofs/namei.c8
-rw-r--r--fs/exofs/super.c4
-rw-r--r--fs/exportfs/expfs.c12
-rw-r--r--fs/ext2/dir.c36
-rw-r--r--fs/ext2/ext2.h14
-rw-r--r--fs/ext2/file.c73
-rw-r--r--fs/ext2/inode.c27
-rw-r--r--fs/ext2/ioctl.c12
-rw-r--r--fs/ext2/namei.c10
-rw-r--r--fs/ext2/super.c32
-rw-r--r--fs/ext2/symlink.c5
-rw-r--r--fs/ext2/xattr.c161
-rw-r--r--fs/ext2/xattr.h21
-rw-r--r--fs/ext2/xattr_security.c30
-rw-r--r--fs/ext2/xattr_trusted.c32
-rw-r--r--fs/ext2/xattr_user.c32
-rw-r--r--fs/ext4/balloc.c7
-rw-r--r--fs/ext4/crypto.c135
-rw-r--r--fs/ext4/crypto_fname.c32
-rw-r--r--fs/ext4/crypto_key.c46
-rw-r--r--fs/ext4/dir.c19
-rw-r--r--fs/ext4/ext4.h261
-rw-r--r--fs/ext4/ext4_crypto.h2
-rw-r--r--fs/ext4/ext4_extents.h2
-rw-r--r--fs/ext4/extents.c303
-rw-r--r--fs/ext4/extents_status.c4
-rw-r--r--fs/ext4/file.c235
-rw-r--r--fs/ext4/ialloc.c15
-rw-r--r--fs/ext4/indirect.c29
-rw-r--r--fs/ext4/inline.c36
-rw-r--r--fs/ext4/inode.c801
-rw-r--r--fs/ext4/ioctl.c393
-rw-r--r--fs/ext4/mballoc.c127
-rw-r--r--fs/ext4/mballoc.h12
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/mmp.c34
-rw-r--r--fs/ext4/move_extent.c43
-rw-r--r--fs/ext4/namei.c65
-rw-r--r--fs/ext4/page-io.c37
-rw-r--r--fs/ext4/readpage.c16
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c213
-rw-r--r--fs/ext4/symlink.c35
-rw-r--r--fs/ext4/sysfs.c2
-rw-r--r--fs/ext4/truncate.h2
-rw-r--r--fs/ext4/xattr.c222
-rw-r--r--fs/ext4/xattr.h3
-rw-r--r--fs/ext4/xattr_security.c31
-rw-r--r--fs/ext4/xattr_trusted.c32
-rw-r--r--fs/ext4/xattr_user.c32
-rw-r--r--fs/f2fs/Kconfig12
-rw-r--r--fs/f2fs/Makefile2
-rw-r--r--fs/f2fs/checkpoint.c252
-rw-r--r--fs/f2fs/crypto.c491
-rw-r--r--fs/f2fs/crypto_key.c254
-rw-r--r--fs/f2fs/crypto_policy.c209
-rw-r--r--fs/f2fs/data.c735
-rw-r--r--fs/f2fs/debug.c41
-rw-r--r--fs/f2fs/dir.c136
-rw-r--r--fs/f2fs/extent_cache.c270
-rw-r--r--fs/f2fs/f2fs.h465
-rw-r--r--fs/f2fs/f2fs_crypto.h151
-rw-r--r--fs/f2fs/file.c540
-rw-r--r--fs/f2fs/gc.c254
-rw-r--r--fs/f2fs/gc.h8
-rw-r--r--fs/f2fs/inline.c62
-rw-r--r--fs/f2fs/inode.c44
-rw-r--r--fs/f2fs/namei.c261
-rw-r--r--fs/f2fs/node.c393
-rw-r--r--fs/f2fs/node.h32
-rw-r--r--fs/f2fs/recovery.c56
-rw-r--r--fs/f2fs/segment.c514
-rw-r--r--fs/f2fs/segment.h5
-rw-r--r--fs/f2fs/shrinker.c3
-rw-r--r--fs/f2fs/super.c454
-rw-r--r--fs/f2fs/trace.c6
-rw-r--r--fs/f2fs/xattr.c131
-rw-r--r--fs/f2fs/xattr.h5
-rw-r--r--fs/fat/Kconfig18
-rw-r--r--fs/fat/cache.c79
-rw-r--r--fs/fat/dir.c22
-rw-r--r--fs/fat/fat.h8
-rw-r--r--fs/fat/fatent.c24
-rw-r--r--fs/fat/file.c69
-rw-r--r--fs/fat/inode.c110
-rw-r--r--fs/fcntl.c3
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file.c13
-rw-r--r--fs/filesystems.c6
-rw-r--r--fs/freevxfs/vxfs_immed.c4
-rw-r--r--fs/freevxfs/vxfs_inode.c1
-rw-r--r--fs/freevxfs/vxfs_lookup.c12
-rw-r--r--fs/freevxfs/vxfs_subr.c2
-rw-r--r--fs/fs-writeback.c83
-rw-r--r--fs/fscache/cookie.c2
-rw-r--r--fs/fscache/netfs.c38
-rw-r--r--fs/fscache/page.c18
-rw-r--r--fs/fuse/cuse.c6
-rw-r--r--fs/fuse/dev.c26
-rw-r--r--fs/fuse/dir.c27
-rw-r--r--fs/fuse/file.c231
-rw-r--r--fs/fuse/fuse_i.h12
-rw-r--r--fs/fuse/inode.c20
-rw-r--r--fs/gfs2/acl.c4
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/aops.c46
-rw-r--r--fs/gfs2/bmap.c25
-rw-r--r--fs/gfs2/dir.c186
-rw-r--r--fs/gfs2/export.c2
-rw-r--r--fs/gfs2/file.c56
-rw-r--r--fs/gfs2/glock.c112
-rw-r--r--fs/gfs2/glock.h30
-rw-r--r--fs/gfs2/glops.c12
-rw-r--r--fs/gfs2/incore.h27
-rw-r--r--fs/gfs2/inode.c142
-rw-r--r--fs/gfs2/inode.h5
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/log.c3
-rw-r--r--fs/gfs2/main.c21
-rw-r--r--fs/gfs2/meta_io.c86
-rw-r--r--fs/gfs2/meta_io.h2
-rw-r--r--fs/gfs2/ops_fstype.c16
-rw-r--r--fs/gfs2/quota.c139
-rw-r--r--fs/gfs2/quota.h2
-rw-r--r--fs/gfs2/rgrp.c65
-rw-r--r--fs/gfs2/rgrp.h6
-rw-r--r--fs/gfs2/super.c67
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/gfs2/util.c2
-rw-r--r--fs/gfs2/util.h2
-rw-r--r--fs/gfs2/xattr.c73
-rw-r--r--fs/gfs2/xattr.h1
-rw-r--r--fs/hfs/bnode.c12
-rw-r--r--fs/hfs/btree.c20
-rw-r--r--fs/hfs/catalog.c6
-rw-r--r--fs/hfs/dir.c4
-rw-r--r--fs/hfs/inode.c16
-rw-r--r--fs/hfs/mdb.c4
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/bitmap.c2
-rw-r--r--fs/hfsplus/bnode.c90
-rw-r--r--fs/hfsplus/btree.c22
-rw-r--r--fs/hfsplus/dir.c4
-rw-r--r--fs/hfsplus/inode.c18
-rw-r--r--fs/hfsplus/ioctl.c4
-rw-r--r--fs/hfsplus/posix_acl.c8
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hfsplus/xattr.c39
-rw-r--r--fs/hfsplus/xattr_security.c21
-rw-r--r--fs/hfsplus/xattr_trusted.c21
-rw-r--r--fs/hfsplus/xattr_user.c21
-rw-r--r--fs/hostfs/hostfs_kern.c50
-rw-r--r--fs/hpfs/dir.c6
-rw-r--r--fs/hpfs/inode.c1
-rw-r--r--fs/hpfs/map.c2
-rw-r--r--fs/hpfs/namei.c38
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c236
-rw-r--r--fs/inode.c27
-rw-r--r--fs/internal.h9
-rw-r--r--fs/ioctl.c75
-rw-r--r--fs/isofs/compress.c36
-rw-r--r--fs/isofs/inode.c5
-rw-r--r--fs/isofs/rock.c4
-rw-r--r--fs/jbd2/commit.c53
-rw-r--r--fs/jbd2/journal.c45
-rw-r--r--fs/jbd2/recovery.c31
-rw-r--r--fs/jbd2/revoke.c60
-rw-r--r--fs/jbd2/transaction.c48
-rw-r--r--fs/jffs2/README.Locking5
-rw-r--r--fs/jffs2/background.c7
-rw-r--r--fs/jffs2/build.c83
-rw-r--r--fs/jffs2/debug.c8
-rw-r--r--fs/jffs2/dir.c11
-rw-r--r--fs/jffs2/file.c66
-rw-r--r--fs/jffs2/fs.c13
-rw-r--r--fs/jffs2/gc.c89
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jffs2/nodelist.c8
-rw-r--r--fs/jffs2/nodelist.h6
-rw-r--r--fs/jffs2/nodemgmt.c4
-rw-r--r--fs/jffs2/security.c30
-rw-r--r--fs/jffs2/super.c7
-rw-r--r--fs/jffs2/symlink.c2
-rw-r--r--fs/jffs2/wbuf.c10
-rw-r--r--fs/jffs2/write.c7
-rw-r--r--fs/jffs2/xattr.c25
-rw-r--r--fs/jffs2/xattr_trusted.c26
-rw-r--r--fs/jffs2/xattr_user.c28
-rw-r--r--fs/jfs/acl.c8
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/ioctl.c6
-rw-r--r--fs/jfs/jfs_logmgr.c9
-rw-r--r--fs/jfs/jfs_metapage.c42
-rw-r--r--fs/jfs/jfs_metapage.h4
-rw-r--r--fs/jfs/namei.c4
-rw-r--r--fs/jfs/super.c13
-rw-r--r--fs/jfs/symlink.c5
-rw-r--r--fs/kernfs/dir.c259
-rw-r--r--fs/kernfs/inode.c4
-rw-r--r--fs/kernfs/mount.c73
-rw-r--r--fs/kernfs/symlink.c24
-rw-r--r--fs/libfs.c56
-rw-r--r--fs/lockd/host.c8
-rw-r--r--fs/lockd/mon.c125
-rw-r--r--fs/lockd/netns.h4
-rw-r--r--fs/lockd/svc.c81
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/locks.c166
-rw-r--r--fs/logfs/Kconfig2
-rw-r--r--fs/logfs/dev_bdev.c2
-rw-r--r--fs/logfs/dev_mtd.c10
-rw-r--r--fs/logfs/dir.c21
-rw-r--r--fs/logfs/file.c34
-rw-r--r--fs/logfs/inode.c6
-rw-r--r--fs/logfs/logfs.h7
-rw-r--r--fs/logfs/readwrite.c24
-rw-r--r--fs/logfs/segment.c32
-rw-r--r--fs/logfs/super.c16
-rw-r--r--fs/mbcache.c1093
-rw-r--r--fs/minix/dir.c18
-rw-r--r--fs/minix/inode.c6
-rw-r--r--fs/minix/itree_v1.c9
-rw-r--r--fs/minix/itree_v2.c9
-rw-r--r--fs/minix/namei.c4
-rw-r--r--fs/mpage.c29
-rw-r--r--fs/namei.c690
-rw-r--r--fs/namespace.c59
-rw-r--r--fs/ncpfs/dir.c22
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/inode.c6
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/ncpfs/ncplib_kernel.h2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c103
-rw-r--r--fs/nfs/blocklayout/blocklayout.h18
-rw-r--r--fs/nfs/blocklayout/dev.c144
-rw-r--r--fs/nfs/blocklayout/extent_tree.c54
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c2
-rw-r--r--fs/nfs/callback.c40
-rw-r--r--fs/nfs/callback.h15
-rw-r--r--fs/nfs/callback_proc.c123
-rw-r--r--fs/nfs/callback_xdr.c51
-rw-r--r--fs/nfs/client.c9
-rw-r--r--fs/nfs/delegation.c6
-rw-r--r--fs/nfs/dir.c59
-rw-r--r--fs/nfs/direct.c68
-rw-r--r--fs/nfs/file.c48
-rw-r--r--fs/nfs/filelayout/filelayout.c20
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c253
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h8
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c119
-rw-r--r--fs/nfs/inode.c131
-rw-r--r--fs/nfs/internal.h55
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/nfs3acl.c4
-rw-r--r--fs/nfs/nfs42.h1
-rw-r--r--fs/nfs/nfs42proc.c206
-rw-r--r--fs/nfs/nfs42xdr.c97
-rw-r--r--fs/nfs/nfs4_fs.h6
-rw-r--r--fs/nfs/nfs4client.c2
-rw-r--r--fs/nfs/nfs4file.c117
-rw-r--r--fs/nfs/nfs4proc.c426
-rw-r--r--fs/nfs/nfs4session.c54
-rw-r--r--fs/nfs/nfs4session.h8
-rw-r--r--fs/nfs/nfs4sysctl.c2
-rw-r--r--fs/nfs/nfs4trace.c1
-rw-r--r--fs/nfs/nfs4trace.h431
-rw-r--r--fs/nfs/nfs4xdr.c56
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/objlayout/objio_osd.c7
-rw-r--r--fs/nfs/pagelist.c132
-rw-r--r--fs/nfs/pnfs.c415
-rw-r--r--fs/nfs/pnfs.h58
-rw-r--r--fs/nfs/pnfs_nfs.c26
-rw-r--r--fs/nfs/read.c68
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/symlink.c39
-rw-r--r--fs/nfs/write.c151
-rw-r--r--fs/nfsd/Kconfig28
-rw-r--r--fs/nfsd/Makefile4
-rw-r--r--fs/nfsd/blocklayout.c298
-rw-r--r--fs/nfsd/blocklayoutxdr.c77
-rw-r--r--fs/nfsd/blocklayoutxdr.h14
-rw-r--r--fs/nfsd/lockd.c2
-rw-r--r--fs/nfsd/netns.h2
-rw-r--r--fs/nfsd/nfs3proc.c7
-rw-r--r--fs/nfsd/nfs3xdr.c6
-rw-r--r--fs/nfsd/nfs4callback.c6
-rw-r--r--fs/nfsd/nfs4layouts.c104
-rw-r--r--fs/nfsd/nfs4proc.c80
-rw-r--r--fs/nfsd/nfs4recover.c47
-rw-r--r--fs/nfsd/nfs4state.c364
-rw-r--r--fs/nfsd/nfs4xdr.c55
-rw-r--r--fs/nfsd/nfscache.c32
-rw-r--r--fs/nfsd/nfsfh.c5
-rw-r--r--fs/nfsd/nfsfh.h47
-rw-r--r--fs/nfsd/nfssvc.c75
-rw-r--r--fs/nfsd/pnfs.h8
-rw-r--r--fs/nfsd/state.h51
-rw-r--r--fs/nfsd/trace.c2
-rw-r--r--fs/nfsd/trace.h43
-rw-r--r--fs/nfsd/vfs.c58
-rw-r--r--fs/nfsd/vfs.h25
-rw-r--r--fs/nfsd/xdr4.h12
-rw-r--r--fs/nilfs2/alloc.c308
-rw-r--r--fs/nilfs2/alloc.h1
-rw-r--r--fs/nilfs2/bmap.c2
-rw-r--r--fs/nilfs2/btnode.c10
-rw-r--r--fs/nilfs2/btree.c7
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/dir.c32
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/gcinode.c2
-rw-r--r--fs/nilfs2/inode.c13
-rw-r--r--fs/nilfs2/ioctl.c4
-rw-r--r--fs/nilfs2/mdt.c20
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c11
-rw-r--r--fs/nilfs2/page.c20
-rw-r--r--fs/nilfs2/recovery.c8
-rw-r--r--fs/nilfs2/segment.c109
-rw-r--r--fs/nilfs2/segment.h3
-rw-r--r--fs/nilfs2/sufile.c11
-rw-r--r--fs/nilfs2/super.c26
-rw-r--r--fs/notify/inode_mark.c3
-rw-r--r--fs/notify/mark.c49
-rw-r--r--fs/ntfs/aops.c50
-rw-r--r--fs/ntfs/aops.h4
-rw-r--r--fs/ntfs/attrib.c28
-rw-r--r--fs/ntfs/bitmap.c10
-rw-r--r--fs/ntfs/compress.c77
-rw-r--r--fs/ntfs/dir.c60
-rw-r--r--fs/ntfs/file.c68
-rw-r--r--fs/ntfs/index.c14
-rw-r--r--fs/ntfs/inode.c12
-rw-r--r--fs/ntfs/lcnalloc.c6
-rw-r--r--fs/ntfs/logfile.c16
-rw-r--r--fs/ntfs/mft.c34
-rw-r--r--fs/ntfs/ntfs.h2
-rw-r--r--fs/ntfs/quota.c6
-rw-r--r--fs/ntfs/super.c88
-rw-r--r--fs/ocfs2/Makefile3
-rw-r--r--fs/ocfs2/alloc.c180
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c1192
-rw-r--r--fs/ocfs2/aops.h19
-rw-r--r--fs/ocfs2/cluster/heartbeat.c243
-rw-r--r--fs/ocfs2/cluster/nodemanager.c305
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h37
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c30
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c13
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c164
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c56
-rw-r--r--fs/ocfs2/dlm/dlmthread.c13
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c6
-rw-r--r--fs/ocfs2/dlmglue.c14
-rw-r--r--fs/ocfs2/file.c199
-rw-r--r--fs/ocfs2/filecheck.c606
-rw-r--r--fs/ocfs2/filecheck.h49
-rw-r--r--fs/ocfs2/inode.c241
-rw-r--r--fs/ocfs2/inode.h9
-rw-r--r--fs/ocfs2/ioctl.c16
-rw-r--r--fs/ocfs2/journal.c26
-rw-r--r--fs/ocfs2/localalloc.c30
-rw-r--r--fs/ocfs2/locks.c5
-rw-r--r--fs/ocfs2/mmap.c14
-rw-r--r--fs/ocfs2/move_extents.c16
-rw-r--r--fs/ocfs2/namei.c52
-rw-r--r--fs/ocfs2/ocfs2.h28
-rw-r--r--fs/ocfs2/ocfs2_trace.h20
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c40
-rw-r--r--fs/ocfs2/refcounttree.c36
-rw-r--r--fs/ocfs2/resize.c25
-rw-r--r--fs/ocfs2/slot_map.c14
-rw-r--r--fs/ocfs2/stackglue.c3
-rw-r--r--fs/ocfs2/stackglue.h2
-rw-r--r--fs/ocfs2/suballoc.c12
-rw-r--r--fs/ocfs2/super.c66
-rw-r--r--fs/ocfs2/super.h2
-rw-r--r--fs/ocfs2/symlink.c3
-rw-r--r--fs/ocfs2/xattr.c209
-rw-r--r--fs/open.c23
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/Kconfig6
-rw-r--r--fs/orangefs/Makefile10
-rw-r--r--fs/orangefs/acl.c175
-rw-r--r--fs/orangefs/dcache.c138
-rw-r--r--fs/orangefs/devorangefs-req.c943
-rw-r--r--fs/orangefs/dir.c396
-rw-r--r--fs/orangefs/downcall.h133
-rw-r--r--fs/orangefs/file.c717
-rw-r--r--fs/orangefs/inode.c461
-rw-r--r--fs/orangefs/namei.c462
-rw-r--r--fs/orangefs/orangefs-bufmap.c556
-rw-r--r--fs/orangefs/orangefs-bufmap.h36
-rw-r--r--fs/orangefs/orangefs-cache.c161
-rw-r--r--fs/orangefs/orangefs-debug.h92
-rw-r--r--fs/orangefs/orangefs-debugfs.c454
-rw-r--r--fs/orangefs/orangefs-debugfs.h3
-rw-r--r--fs/orangefs/orangefs-dev-proto.h62
-rw-r--r--fs/orangefs/orangefs-kernel.h623
-rw-r--r--fs/orangefs/orangefs-mod.c293
-rw-r--r--fs/orangefs/orangefs-sysfs.c1772
-rw-r--r--fs/orangefs/orangefs-sysfs.h2
-rw-r--r--fs/orangefs/orangefs-utils.c1052
-rw-r--r--fs/orangefs/protocol.h455
-rw-r--r--fs/orangefs/super.c559
-rw-r--r--fs/orangefs/symlink.c19
-rw-r--r--fs/orangefs/upcall.h246
-rw-r--r--fs/orangefs/waitqueue.c357
-rw-r--r--fs/orangefs/xattr.c530
-rw-r--r--fs/overlayfs/copy_up.c103
-rw-r--r--fs/overlayfs/dir.c83
-rw-r--r--fs/overlayfs/inode.c114
-rw-r--r--fs/overlayfs/overlayfs.h7
-rw-r--r--fs/overlayfs/readdir.c76
-rw-r--r--fs/overlayfs/super.c114
-rw-r--r--fs/pipe.c71
-rw-r--r--fs/pnode.c9
-rw-r--r--fs/posix_acl.c43
-rw-r--r--fs/proc/array.c12
-rw-r--r--fs/proc/base.c147
-rw-r--r--fs/proc/fd.c15
-rw-r--r--fs/proc/inode.c24
-rw-r--r--fs/proc/kcore.c4
-rw-r--r--fs/proc/meminfo.c34
-rw-r--r--fs/proc/namespaces.c17
-rw-r--r--fs/proc/page.c4
-rw-r--r--fs/proc/self.c22
-rw-r--r--fs/proc/task_mmu.c224
-rw-r--r--fs/proc/task_nommu.c49
-rw-r--r--fs/proc/thread_self.c23
-rw-r--r--fs/proc/vmcore.c11
-rw-r--r--fs/proc_namespace.c29
-rw-r--r--fs/pstore/inode.c10
-rw-r--r--fs/pstore/ram.c4
-rw-r--r--fs/qnx4/inode.c3
-rw-r--r--fs/qnx6/dir.c16
-rw-r--r--fs/qnx6/inode.c7
-rw-r--r--fs/qnx6/qnx6.h2
-rw-r--r--fs/quota/dquot.c85
-rw-r--r--fs/quota/netlink.c5
-rw-r--r--fs/quota/quota.c70
-rw-r--r--fs/quota/quota_tree.c67
-rw-r--r--fs/quota/quota_v2.c10
-rw-r--r--fs/ramfs/inode.c5
-rw-r--r--fs/read_write.c548
-rw-r--r--fs/readdir.c2
-rw-r--r--fs/reiserfs/dir.c4
-rw-r--r--fs/reiserfs/file.c8
-rw-r--r--fs/reiserfs/inode.c45
-rw-r--r--fs/reiserfs/ioctl.c6
-rw-r--r--fs/reiserfs/journal.c30
-rw-r--r--fs/reiserfs/namei.c7
-rw-r--r--fs/reiserfs/prints.c9
-rw-r--r--fs/reiserfs/procfs.c5
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c6
-rw-r--r--fs/reiserfs/tail_conversion.c4
-rw-r--r--fs/reiserfs/xattr.c104
-rw-r--r--fs/reiserfs/xattr_acl.c8
-rw-r--r--fs/reiserfs/xattr_security.c23
-rw-r--r--fs/reiserfs/xattr_trusted.c22
-rw-r--r--fs/reiserfs/xattr_user.c21
-rw-r--r--fs/romfs/super.c5
-rw-r--r--fs/select.c14
-rw-r--r--fs/seq_file.c58
-rw-r--r--fs/splice.c55
-rw-r--r--fs/squashfs/block.c4
-rw-r--r--fs/squashfs/cache.c18
-rw-r--r--fs/squashfs/decompressor.c2
-rw-r--r--fs/squashfs/file.c24
-rw-r--r--fs/squashfs/file_direct.c22
-rw-r--r--fs/squashfs/inode.c2
-rw-r--r--fs/squashfs/lz4_wrapper.c8
-rw-r--r--fs/squashfs/lzo_wrapper.c8
-rw-r--r--fs/squashfs/page_actor.c4
-rw-r--r--fs/squashfs/page_actor.h2
-rw-r--r--fs/squashfs/super.c12
-rw-r--r--fs/squashfs/symlink.c9
-rw-r--r--fs/squashfs/xattr.c86
-rw-r--r--fs/squashfs/xz_wrapper.c4
-rw-r--r--fs/squashfs/zlib_wrapper.c4
-rw-r--r--fs/stat.c4
-rw-r--r--fs/super.c5
-rw-r--r--fs/sync.c7
-rw-r--r--fs/sysfs/group.c17
-rw-r--r--fs/sysv/dir.c18
-rw-r--r--fs/sysv/inode.c17
-rw-r--r--fs/sysv/namei.c4
-rw-r--r--fs/timerfd.c2
-rw-r--r--fs/tracefs/inode.c34
-rw-r--r--fs/ubifs/Kconfig15
-rw-r--r--fs/ubifs/Makefile1
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/dir.c29
-rw-r--r--fs/ubifs/file.c110
-rw-r--r--fs/ubifs/key.h6
-rw-r--r--fs/ubifs/lpt.c6
-rw-r--r--fs/ubifs/misc.c57
-rw-r--r--fs/ubifs/misc.h9
-rw-r--r--fs/ubifs/recovery.c8
-rw-r--r--fs/ubifs/super.c23
-rw-r--r--fs/ubifs/tnc.c3
-rw-r--r--fs/ubifs/ubifs.h53
-rw-r--r--fs/ubifs/xattr.c61
-rw-r--r--fs/udf/balloc.c98
-rw-r--r--fs/udf/dir.c13
-rw-r--r--fs/udf/file.c16
-rw-r--r--fs/udf/inode.c252
-rw-r--r--fs/udf/namei.c37
-rw-r--r--fs/udf/super.c62
-rw-r--r--fs/udf/symlink.c4
-rw-r--r--fs/udf/udfdecl.h26
-rw-r--r--fs/udf/unicode.c623
-rw-r--r--fs/ufs/Makefile2
-rw-r--r--fs/ufs/balloc.c6
-rw-r--r--fs/ufs/dir.c32
-rw-r--r--fs/ufs/inode.c9
-rw-r--r--fs/ufs/namei.c11
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/ufs/symlink.c42
-rw-r--r--fs/ufs/ufs.h4
-rw-r--r--fs/ufs/util.c4
-rw-r--r--fs/ufs/util.h2
-rw-r--r--fs/userfaultfd.c6
-rw-r--r--fs/utimes.c4
-rw-r--r--fs/xattr.c240
-rw-r--r--fs/xfs/Makefile5
-rw-r--r--fs/xfs/kmem.c10
-rw-r--r--fs/xfs/kmem.h1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c34
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h9
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c37
-rw-r--r--fs/xfs/libxfs/xfs_attr.c147
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c34
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h16
-rw-r--r--fs/xfs/libxfs/xfs_bit.c6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c284
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h15
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_btree.c111
-rw-r--r--fs/xfs/libxfs/xfs_btree.h42
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h16
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c18
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c5
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c5
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c8
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c37
-rw-r--r--fs/xfs/libxfs/xfs_format.h31
-rw-r--r--fs/xfs/libxfs/xfs_fs.h48
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c15
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c39
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c186
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h38
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c3
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h19
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h1
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h5
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c32
-rw-r--r--fs/xfs/libxfs/xfs_sb.c12
-rw-r--r--fs/xfs/libxfs/xfs_sb.h1
-rw-r--r--fs/xfs/libxfs/xfs_shared.h2
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c8
-rw-r--r--fs/xfs/xfs_acl.c37
-rw-r--r--fs/xfs/xfs_acl.h8
-rw-r--r--fs/xfs/xfs_aops.c1130
-rw-r--r--fs/xfs/xfs_aops.h8
-rw-r--r--fs/xfs/xfs_attr_list.c21
-rw-r--r--fs/xfs/xfs_bmap_util.c94
-rw-r--r--fs/xfs/xfs_buf.c50
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_buf_item.c10
-rw-r--r--fs/xfs/xfs_dir2_readdir.c4
-rw-r--r--fs/xfs/xfs_discard.c2
-rw-r--r--fs/xfs/xfs_dquot.c156
-rw-r--r--fs/xfs/xfs_error.c4
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_file.c246
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fsops.h1
-rw-r--r--fs/xfs/xfs_icache.c61
-rw-r--r--fs/xfs/xfs_inode.c267
-rw-r--r--fs/xfs/xfs_inode.h10
-rw-r--r--fs/xfs/xfs_inode_item.c83
-rw-r--r--fs/xfs/xfs_inode_item.h1
-rw-r--r--fs/xfs/xfs_ioctl.c234
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c85
-rw-r--r--fs/xfs/xfs_iops.c77
-rw-r--r--fs/xfs/xfs_itable.c22
-rw-r--r--fs/xfs/xfs_linux.h9
-rw-r--r--fs/xfs/xfs_log.c288
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_priv.h54
-rw-r--r--fs/xfs/xfs_log_recover.c871
-rw-r--r--fs/xfs/xfs_message.c7
-rw-r--r--fs/xfs/xfs_mount.c47
-rw-r--r--fs/xfs/xfs_mount.h40
-rw-r--r--fs/xfs/xfs_ondisk.h117
-rw-r--r--fs/xfs/xfs_pnfs.c13
-rw-r--r--fs/xfs/xfs_pnfs.h2
-rw-r--r--fs/xfs/xfs_qm.c71
-rw-r--r--fs/xfs/xfs_qm.h48
-rw-r--r--fs/xfs/xfs_qm_syscalls.c27
-rw-r--r--fs/xfs/xfs_quotaops.c36
-rw-r--r--fs/xfs/xfs_rtalloc.c5
-rw-r--r--fs/xfs/xfs_stats.c93
-rw-r--r--fs/xfs/xfs_stats.h36
-rw-r--r--fs/xfs/xfs_super.c599
-rw-r--r--fs/xfs/xfs_super.h4
-rw-r--r--fs/xfs/xfs_symlink.c12
-rw-r--r--fs/xfs/xfs_sysctl.c15
-rw-r--r--fs/xfs/xfs_sysfs.c277
-rw-r--r--fs/xfs/xfs_sysfs.h1
-rw-r--r--fs/xfs/xfs_trace.h37
-rw-r--r--fs/xfs/xfs_trans.c10
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--fs/xfs/xfs_trans_ail.c17
-rw-r--r--fs/xfs/xfs_trans_buf.c10
-rw-r--r--fs/xfs/xfs_trans_dquot.c29
-rw-r--r--fs/xfs/xfs_trans_inode.c23
-rw-r--r--fs/xfs/xfs_xattr.c184
854 files changed, 45899 insertions, 21933 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ff7be98f84f2..9619ccadd2fc 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -10,10 +10,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
vfs_dentry.o \
v9fs.o \
fid.o \
- xattr.o \
- xattr_user.o \
- xattr_trusted.o
+ xattr.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
-9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 31c010372660..9da967f38387 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -67,8 +67,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
return 0;
}
/* get the default/access acl values and cache them */
- dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
- pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+ dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT);
+ pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS);
if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
@@ -133,10 +133,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
goto err_free_out;
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -212,42 +212,22 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
return 0;
}
-static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_get(dentry, full_name, buffer, size);
-}
-
-static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct v9fs_session_info *v9ses;
struct posix_acl *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* We allow set/get/list of acl when access=client is not specified
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+ return v9fs_xattr_get(dentry, handler->name, buffer, size);
- acl = v9fs_get_cached_acl(d_inode(dentry), type);
+ acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -258,46 +238,23 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
return error;
}
-static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_set(dentry, full_name, value, size, flags);
-}
-
-
-static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
+static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
int retval;
struct posix_acl *acl;
struct v9fs_session_info *v9ses;
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
v9ses = v9fs_dentry2v9ses(dentry);
/*
* set the attribute on the remote. Without even looking at the
* xattr value. We leave it to the server to validate
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_set_acl(dentry, name,
- value, size, flags, type);
+ return v9fs_xattr_set(dentry, handler->name, value, size,
+ flags);
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -316,9 +273,8 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
} else
acl = NULL;
- switch (type) {
+ switch (handler->flags) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
umode_t mode = inode->i_mode;
retval = posix_acl_equiv_mode(acl, &mode);
@@ -349,7 +305,6 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
if (!S_ISDIR(inode->i_mode)) {
retval = acl ? -EINVAL : 0;
goto err_out;
@@ -358,23 +313,23 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
default:
BUG();
}
- retval = v9fs_xattr_set(dentry, name, value, size, flags);
+ retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
if (!retval)
- set_cached_acl(inode, type, acl);
+ set_cached_acl(inode, handler->flags, acl);
err_out:
posix_acl_release(acl);
return retval;
}
const struct xattr_handler v9fs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
};
const struct xattr_handler v9fs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = v9fs_xattr_get_acl,
.set = v9fs_xattr_set_acl,
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a69260f27555..103ca5e1267b 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -243,14 +243,14 @@ void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
if (!v9inode->fscache)
return;
- spin_lock(&v9inode->fscache_lock);
+ mutex_lock(&v9inode->fscache_lock);
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
v9fs_cache_inode_flush_cookie(inode);
else
v9fs_cache_inode_get_cookie(inode);
- spin_unlock(&v9inode->fscache_lock);
+ mutex_unlock(&v9inode->fscache_lock);
}
void v9fs_cache_inode_reset_cookie(struct inode *inode)
@@ -264,7 +264,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
old = v9inode->fscache;
- spin_lock(&v9inode->fscache_lock);
+ mutex_lock(&v9inode->fscache_lock);
fscache_relinquish_cookie(v9inode->fscache, 1);
v9ses = v9fs_inode2v9ses(inode);
@@ -274,7 +274,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
inode, old, v9inode->fscache);
- spin_unlock(&v9inode->fscache_lock);
+ mutex_unlock(&v9inode->fscache_lock);
}
int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 2f9675491095..247e47e54bcc 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -21,6 +21,7 @@
*/
#ifndef _9P_CACHE_H
+#define _9P_CACHE_H
#ifdef CONFIG_9P_FSCACHE
#include <linux/fscache.h>
#include <linux/spinlock.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6caca025019d..072e7599583a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void)
v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
sizeof(struct v9fs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
v9fs_inode_init_once);
if (!v9fs_inode_cache)
return -ENOMEM;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 0923f2cf3c80..6877050384a1 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -123,7 +123,7 @@ struct v9fs_session_info {
struct v9fs_inode {
#ifdef CONFIG_9P_FSCACHE
- spinlock_t fscache_lock;
+ struct mutex fscache_lock;
struct fscache_cookie *fscache;
#endif
struct p9_qid qid;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e9e04376c52c..ac9225e86bf3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -153,7 +153,7 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset,
* If called with zero offset, we should release
* the private state assocated with the page
*/
- if (offset == 0 && length == PAGE_CACHE_SIZE)
+ if (offset == 0 && length == PAGE_SIZE)
v9fs_fscache_invalidate_page(page);
}
@@ -166,10 +166,10 @@ static int v9fs_vfs_writepage_locked(struct page *page)
struct bio_vec bvec;
int err, len;
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
bvec.bv_page = page;
bvec.bv_offset = 0;
@@ -271,7 +271,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
int retval = 0;
struct page *page;
struct v9fs_inode *v9inode;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct inode *inode = mapping->host;
@@ -288,11 +288,11 @@ start:
if (PageUptodate(page))
goto out;
- if (len == PAGE_CACHE_SIZE)
+ if (len == PAGE_SIZE)
goto out;
retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
- page_cache_release(page);
+ put_page(page);
if (!retval)
goto start;
out:
@@ -313,7 +313,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
/*
* zero out the rest of the area
*/
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
zero_user(page, from + copied, len - copied);
flush_dcache_page(page);
@@ -331,7 +331,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
}
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 7bf835f85bc8..b84c291ba1eb 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -421,8 +421,8 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
loff_t i_size;
unsigned long pg_start, pg_end;
- pg_start = origin >> PAGE_CACHE_SHIFT;
- pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT;
+ pg_start = origin >> PAGE_SHIFT;
+ pg_end = (origin + retval - 1) >> PAGE_SHIFT;
if (inode->i_mapping && inode->i_mapping->nrpages)
invalidate_inode_pages2_range(inode->i_mapping,
pg_start, pg_end);
@@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
if (retval)
return retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
v9fs_blank_wstat(&wstat);
retval = p9_client_wstat(fid, &wstat);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
@@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
if (retval)
return retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
fid = filp->private_data;
retval = p9_client_fsync(fid, datasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b1dc51888048..3a08b3e6ff1d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -244,7 +244,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
return NULL;
#ifdef CONFIG_9P_FSCACHE
v9inode->fscache = NULL;
- spin_lock_init(&v9inode->fscache_lock);
+ mutex_init(&v9inode->fscache_lock);
#endif
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
@@ -451,9 +451,9 @@ void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- truncate_inode_pages_final(inode->i_mapping);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- filemap_fdatawrite(inode->i_mapping);
+ filemap_fdatawrite(&inode->i_data);
v9fs_cache_inode_put_cookie(inode);
/* clunk the fid stashed in writeback_fid */
@@ -1223,18 +1223,26 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
}
/**
- * v9fs_vfs_follow_link - follow a symlink path
+ * v9fs_vfs_get_link - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: delayed call for when we are done with the return value
*/
-static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *v9fs_vfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct v9fs_session_info *v9ses;
+ struct p9_fid *fid;
struct p9_wstat *st;
char *res;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ v9ses = v9fs_dentry2v9ses(dentry);
+ fid = v9fs_fid_lookup(dentry);
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
if (IS_ERR(fid))
@@ -1259,7 +1267,8 @@ static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
p9stat_free(st);
kfree(st);
- return *cookie = res;
+ set_delayed_call(done, kfree_link, res);
+ return res;
}
/**
@@ -1368,9 +1377,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/* build extension */
if (S_ISBLK(mode))
sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
@@ -1455,8 +1461,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
static const struct inode_operations v9fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link,
.getattr = v9fs_vfs_getattr,
.setattr = v9fs_vfs_setattr,
};
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e8aa57dc8d6d..a34702c998f5 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -829,9 +829,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
@@ -902,26 +899,34 @@ error:
}
/**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * v9fs_vfs_get_link_dotl - follow a symlink path
* @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: destructor for return value
*/
static const char *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
+v9fs_vfs_get_link_dotl(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct p9_fid *fid = v9fs_fid_lookup(dentry);
+ struct p9_fid *fid;
char *target;
int retval;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
retval = p9_client_readlink(fid, &target);
if (retval)
return ERR_PTR(retval);
- return *cookie = target;
+ set_delayed_call(done, kfree_link, target);
+ return target;
}
int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
@@ -987,8 +992,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
const struct inode_operations v9fs_symlink_inode_operations_dotl = {
.readlink = generic_readlink,
- .follow_link = v9fs_vfs_follow_link_dotl,
- .put_link = kfree_put_link,
+ .get_link = v9fs_vfs_get_link_dotl,
.getattr = v9fs_vfs_getattr_dotl,
.setattr = v9fs_vfs_setattr_dotl,
.setxattr = generic_setxattr,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bf495cedec26..de3ed8629196 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -87,7 +87,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_op = &v9fs_super_ops;
sb->s_bdi = &v9ses->bdi;
if (v9ses->cache)
- sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
+ sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
if (!v9ses->cache)
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 0cf44b6cccd6..9dd9b47a6c1a 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -137,6 +137,44 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
}
+static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+static struct xattr_handler v9fs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+static struct xattr_handler v9fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+#ifdef CONFIG_9P_FS_SECURITY
+static struct xattr_handler v9fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+#endif
+
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index d3e2ea3840be..c63c3bea5de5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -19,9 +19,6 @@
#include <net/9p/client.h>
extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern struct xattr_handler v9fs_xattr_user_handler;
-extern struct xattr_handler v9fs_xattr_trusted_handler;
-extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
deleted file mode 100644
index cb247a142a6e..000000000000
--- a/fs/9p/xattr_security.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = v9fs_xattr_security_get,
- .set = v9fs_xattr_security_set,
-};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
deleted file mode 100644
index e30d33b8a3fb..000000000000
--- a/fs/9p/xattr_trusted.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .get = v9fs_xattr_trusted_get,
- .set = v9fs_xattr_trusted_set,
-};
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
deleted file mode 100644
index d0b701b72080..000000000000
--- a/fs/9p/xattr_user.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = v9fs_xattr_user_get,
- .set = v9fs_xattr_user_set,
-};
diff --git a/fs/Kconfig b/fs/Kconfig
index da3f32f1a4e4..6725f59c18e6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -46,6 +46,13 @@ config FS_DAX
or if unsure, say N. Saying Y will increase the size of the kernel
by about 5kB.
+config FS_DAX_PMD
+ bool
+ default FS_DAX
+ depends on FS_DAX
+ depends on ZONE_DEVICE
+ depends on TRANSPARENT_HUGEPAGE
+
endif # BLOCK
# Posix ACL utility routines
@@ -67,6 +74,18 @@ config FILE_LOCKING
for filesystems like NFS and for the flock() system
call. Disabling this option saves about 11k.
+config MANDATORY_FILE_LOCKING
+ bool "Enable Mandatory file locking"
+ depends on FILE_LOCKING
+ default y
+ help
+ This option enables appropriately marked files on appropriately
+ mounted filesystems to support mandatory locking.
+
+ To the best of my knowledge this is dead code that no one cares about.
+
+source "fs/crypto/Kconfig"
+
source "fs/notify/Kconfig"
source "fs/quota/Kconfig"
@@ -190,6 +209,7 @@ menuconfig MISC_FILESYSTEMS
if MISC_FILESYSTEMS
+source "fs/orangefs/Kconfig"
source "fs/adfs/Kconfig"
source "fs/affs/Kconfig"
source "fs/ecryptfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 79f522575cba..85b6e13b62d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
+obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
@@ -105,6 +106,7 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
obj-$(CONFIG_OVERLAY_FS) += overlayfs/
+obj-$(CONFIG_ORANGEFS_FS) += orangefs/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
obj-$(CONFIG_OMFS_FS) += omfs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 24575d9d882d..fadf408bdd46 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -44,24 +44,24 @@ struct adfs_dir_ops;
*/
struct adfs_sb_info {
union { struct {
- struct adfs_discmap *s_map; /* bh list containing map */
- struct adfs_dir_ops *s_dir; /* directory operations */
+ struct adfs_discmap *s_map; /* bh list containing map */
+ const struct adfs_dir_ops *s_dir; /* directory operations */
};
- struct rcu_head rcu; /* used only at shutdown time */
+ struct rcu_head rcu; /* used only at shutdown time */
};
- kuid_t s_uid; /* owner uid */
- kgid_t s_gid; /* owner gid */
- umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
- umode_t s_other_mask; /* ADFS other perm -> unix perm */
+ kuid_t s_uid; /* owner uid */
+ kgid_t s_gid; /* owner gid */
+ umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
+ umode_t s_other_mask; /* ADFS other perm -> unix perm */
int s_ftsuffix; /* ,xyz hex filetype suffix option */
- __u32 s_ids_per_zone; /* max. no ids in one zone */
- __u32 s_idlen; /* length of ID in map */
- __u32 s_map_size; /* sector size of a map */
- unsigned long s_size; /* total size (in blocks) of this fs */
- signed int s_map2blk; /* shift left by this for map->sector */
- unsigned int s_log2sharesize;/* log2 share size */
- __le32 s_version; /* disc format version */
+ __u32 s_ids_per_zone; /* max. no ids in one zone */
+ __u32 s_idlen; /* length of ID in map */
+ __u32 s_map_size; /* sector size of a map */
+ unsigned long s_size; /* total size (in blocks) of this fs */
+ signed int s_map2blk; /* shift left by this for map->sector*/
+ unsigned int s_log2sharesize;/* log2 share size */
+ __le32 s_version; /* disc format version */
unsigned int s_namelen; /* maximum number of characters in name */
};
@@ -168,8 +168,8 @@ void __adfs_error(struct super_block *sb, const char *function,
extern const struct inode_operations adfs_dir_inode_operations;
extern const struct file_operations adfs_dir_operations;
extern const struct dentry_operations adfs_dentry_operations;
-extern struct adfs_dir_ops adfs_f_dir_ops;
-extern struct adfs_dir_ops adfs_fplus_dir_ops;
+extern const struct adfs_dir_ops adfs_f_dir_ops;
+extern const struct adfs_dir_ops adfs_fplus_dir_ops;
extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
int wait);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 51c279a29845..fd4cf2c48e48 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -21,7 +21,7 @@ adfs_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct object_info obj;
struct adfs_dir dir;
int ret = 0;
@@ -69,7 +69,7 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
{
int ret = -EINVAL;
#ifdef CONFIG_ADFS_FS_RW
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct adfs_dir dir;
printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n",
@@ -129,7 +129,7 @@ static int
adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj)
{
struct super_block *sb = inode->i_sb;
- struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+ const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
struct adfs_dir dir;
int ret;
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 4bbe853ee50a..0fbfd0b04ae0 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -476,7 +476,7 @@ adfs_f_free(struct adfs_dir *dir)
dir->sb = NULL;
}
-struct adfs_dir_ops adfs_f_dir_ops = {
+const struct adfs_dir_ops adfs_f_dir_ops = {
.read = adfs_f_read,
.setpos = adfs_f_setpos,
.getnext = adfs_f_getnext,
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 82d14cdf70f9..c92cfb638c18 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -256,7 +256,7 @@ adfs_fplus_free(struct adfs_dir *dir)
dir->sb = NULL;
}
-struct adfs_dir_ops adfs_fplus_dir_ops = {
+const struct adfs_dir_ops adfs_fplus_dir_ops = {
.read = adfs_fplus_read,
.setpos = adfs_fplus_setpos,
.getnext = adfs_fplus_getnext,
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4d4a0df8344f..c9fdfb112933 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int __init init_inodecache(void)
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (adfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c69a87eaf57d..cc2b2efc9211 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -138,7 +138,7 @@ extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh);
extern int affs_remove_header(struct dentry *dentry);
extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
-extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
+extern void secs_to_datestamp(time64_t secs, struct affs_date *ds);
extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
__printf(3, 4)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 5fa92bc790ef..d6c7a51c93e4 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -8,6 +8,7 @@
* Please send bug reports to: hjw@zvw.de
*/
+#include <linux/math64.h>
#include "affs.h"
/*
@@ -366,22 +367,22 @@ affs_fix_checksum(struct super_block *sb, struct buffer_head *bh)
}
void
-secs_to_datestamp(time_t secs, struct affs_date *ds)
+secs_to_datestamp(time64_t secs, struct affs_date *ds)
{
u32 days;
u32 minute;
+ s32 rem;
secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60);
if (secs < 0)
secs = 0;
- days = secs / 86400;
- secs -= days * 86400;
- minute = secs / 60;
- secs -= minute * 60;
+ days = div_s64_rem(secs, 86400, &rem);
+ minute = rem / 60;
+ rem -= minute * 60;
ds->days = cpu_to_be32(days);
ds->mins = cpu_to_be32(minute);
- ds->ticks = cpu_to_be32(secs * 50);
+ ds->ticks = cpu_to_be32(rem * 50);
}
umode_t
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 659c579c4588..0cde550050e8 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp)
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (inode->i_size != AFFS_I(inode)->mmu_private)
affs_truncate(inode);
affs_free_prealloc(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
@@ -510,11 +510,9 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
page->index, to);
- BUG_ON(to > PAGE_CACHE_SIZE);
- kmap(page);
- data = page_address(page);
+ BUG_ON(to > PAGE_SIZE);
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = page->index << PAGE_CACHE_SHIFT;
+ tmp = page->index << PAGE_SHIFT;
bidx = tmp / bsize;
boff = tmp % bsize;
@@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
+ data = kmap_atomic(page);
memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
+ kunmap_atomic(data);
affs_brelse(bh);
bidx++;
pos += tmp;
boff = 0;
}
flush_dcache_page(page);
- kunmap(page);
return 0;
}
@@ -614,10 +613,10 @@ affs_readpage_ofs(struct file *file, struct page *page)
int err;
pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
- to = PAGE_CACHE_SIZE;
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
- to = inode->i_size & ~PAGE_CACHE_MASK;
- memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
+ to = PAGE_SIZE;
+ if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) {
+ to = inode->i_size & ~PAGE_MASK;
+ memset(page_address(page) + to, 0, PAGE_SIZE - to);
}
err = affs_do_readpage_ofs(page, to);
@@ -647,7 +646,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return err;
}
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -657,10 +656,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_SIZE);
if (err) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return err;
}
@@ -678,7 +677,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
u32 tmp;
int written;
- from = pos & (PAGE_CACHE_SIZE - 1);
+ from = pos & (PAGE_SIZE - 1);
to = pos + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
@@ -693,7 +692,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
bh = NULL;
written = 0;
- tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+ tmp = (page->index << PAGE_SHIFT) + from;
bidx = tmp / bsize;
boff = tmp % bsize;
if (boff) {
@@ -789,13 +788,13 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
done:
affs_brelse(bh);
- tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+ tmp = (page->index << PAGE_SHIFT) + from;
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
err_first_bh:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return written;
@@ -958,12 +957,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = write_inode_now(inode, 0);
err = sync_blockdev(inode->i_sb->s_bdev);
if (!ret)
ret = err;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
const struct file_operations affs_file_operations = {
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 17349500592d..0fdb0f5b2239 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
case ST_SOFTLINK:
inode->i_mode |= S_IFLNK;
+ inode_nohighmem(inode);
inode->i_op = &affs_symlink_inode_operations;
inode->i_data.a_ops = &affs_symlink_aops;
break;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 181e05b46e72..00d3002a6780 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -344,6 +344,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
return -ENOSPC;
inode->i_op = &affs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &affs_symlink_aops;
inode->i_mode = S_IFLNK | 0777;
mode_to_prot(inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5b50c4ca43a7..2a6713b6b9f4 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -32,7 +32,7 @@ affs_commit_super(struct super_block *sb, int wait)
struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
lock_buffer(bh);
- secs_to_datestamp(get_seconds(), &tail->disk_change);
+ secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change);
affs_fix_checksum(sb, bh);
unlock_buffer(bh);
@@ -132,7 +132,7 @@ static int __init init_inodecache(void)
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (affs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ea5b69a18ba9..69b03dbb792f 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -14,13 +14,13 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
{
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
- char *link = kmap(page);
+ char *link = page_address(page);
struct slink_front *lf;
int i, j;
char c;
char lc;
- pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
+ pr_debug("get_link(ino=%lu)\n", inode->i_ino);
bh = affs_bread(inode->i_sb, inode->i_ino);
if (!bh)
@@ -57,12 +57,10 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
link[i] = '\0';
affs_brelse(bh);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
@@ -73,7 +71,6 @@ const struct address_space_operations affs_symlink_aops = {
const struct inode_operations affs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = affs_notify_change,
};
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index e10e17788f06..5fda2bc53cd7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -181,7 +181,7 @@ error:
static inline void afs_dir_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 999bc3caec92..6344aee4ac4b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -164,7 +164,7 @@ int afs_page_filler(void *data, struct page *page)
_debug("cache said ENOBUFS");
default:
go_on:
- offset = page->index << PAGE_CACHE_SHIFT;
+ offset = page->index << PAGE_SHIFT;
len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
/* read the contents of the file from the server into the
@@ -319,7 +319,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == PAGE_CACHE_SIZE) {
+ if (offset == 0 && length == PAGE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 4baf1d2b39e4..d91a9c9cfbd0 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
fl->fl_type = F_UNLCK;
- mutex_lock(&vnode->vfs_inode.i_mutex);
+ inode_lock(&vnode->vfs_inode);
/* check local lock records first */
ret = 0;
@@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
}
error:
- mutex_unlock(&vnode->vfs_inode.i_mutex);
+ inode_unlock(&vnode->vfs_inode);
_leave(" = %d [%hd]", ret, fl->fl_type);
return ret;
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e06f5a23352a..86cc7264c21c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -56,6 +56,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
case AFS_FTYPE_SYMLINK:
inode->i_mode = S_IFLNK | vnode->status.mode;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
default:
printk("kAFS: AFS vnode with undefined type\n");
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index ccd0b212e82a..81dd075356b9 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -93,7 +93,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
kunmap(page);
out_free:
- page_cache_release(page);
+ put_page(page);
out:
_leave(" = %d", ret);
return ret;
@@ -189,7 +189,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
buf = kmap_atomic(page);
memcpy(devname, buf, size);
kunmap_atomic(buf);
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
@@ -211,7 +211,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
return mnt;
error:
- page_cache_release(page);
+ put_page(page);
error_no_page:
free_page((unsigned long) options);
error_no_options:
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 24a905b076fd..2853b4095344 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -230,14 +230,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
if (size <= 1 || size >= PAGE_SIZE)
return -EINVAL;
- kbuf = kmalloc(size + 1, GFP_KERNEL);
- if (!kbuf)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, size) != 0)
- goto done;
- kbuf[size] = 0;
+ kbuf = memdup_user_nul(buf, size);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
/* trim to first NL */
name = memchr(kbuf, '\n', size);
@@ -315,15 +310,9 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (size <= 1 || size >= PAGE_SIZE)
return -EINVAL;
- ret = -ENOMEM;
- kbuf = kmalloc(size + 1, GFP_KERNEL);
- if (!kbuf)
- goto nomem;
-
- ret = -EFAULT;
- if (copy_from_user(kbuf, buf, size) != 0)
- goto infault;
- kbuf[size] = 0;
+ kbuf = memdup_user_nul(buf, size);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
/* trim to first NL */
s = memchr(kbuf, '\n', size);
@@ -337,9 +326,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
if (ret >= 0)
ret = size; /* consume everything, always */
-infault:
kfree(kbuf);
-nomem:
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fb4a5129f7d..fbdb022b75a2 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
afs_inode_cachep = kmem_cache_create("afs_inode_cache",
sizeof(struct afs_vnode),
0,
- SLAB_HWCACHE_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
afs_i_init_once);
if (!afs_inode_cachep) {
printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
@@ -315,8 +315,8 @@ static int afs_fill_super(struct super_block *sb,
_enter("");
/* fill in the superblock */
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
sb->s_bdi = &as->volume->bdi;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 0714abcd7f32..65de439bdc4f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -93,10 +93,10 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
_enter(",,%llu", (unsigned long long)pos);
i_size = i_size_read(&vnode->vfs_inode);
- if (pos + PAGE_CACHE_SIZE > i_size)
+ if (pos + PAGE_SIZE > i_size)
len = i_size - pos;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
if (ret < 0) {
@@ -123,9 +123,9 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct page *page;
struct key *key = file->private_data;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
int ret;
_enter("{%x:%u},{%lx},%u,%u",
@@ -151,8 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
*pagep = page;
/* page won't leak in error case: it eventually gets cleaned off LRU */
- if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) {
- ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page);
+ if (!PageUptodate(page) && len != PAGE_SIZE) {
+ ret = afs_fill_page(vnode, key, index << PAGE_SHIFT, page);
if (ret < 0) {
kfree(candidate);
_leave(" = %d [prep]", ret);
@@ -266,7 +266,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
if (PageDirty(page))
_debug("dirtied");
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -480,7 +480,7 @@ static int afs_writepages_region(struct address_space *mapping,
if (page->index > end) {
*_next = index;
- page_cache_release(page);
+ put_page(page);
_leave(" = 0 [%lx]", *_next);
return 0;
}
@@ -494,7 +494,7 @@ static int afs_writepages_region(struct address_space *mapping,
if (page->mapping != mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
continue;
}
@@ -515,7 +515,7 @@ static int afs_writepages_region(struct address_space *mapping,
ret = afs_write_back_from_locked_page(wb, page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (ret < 0) {
_leave(" = %d", ret);
return ret;
@@ -551,13 +551,13 @@ int afs_writepages(struct address_space *mapping,
&next);
mapping->writeback_index = next;
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT);
+ end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT);
ret = afs_writepages_region(mapping, wbc, 0, end, &next);
if (wbc->nr_to_write > 0)
mapping->writeback_index = next;
} else {
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ start = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
ret = afs_writepages_region(mapping, wbc, start, end, &next);
}
@@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* use a writeback record as a marker in the queue - when this reaches
* the front of the queue, all the outstanding writes are either
@@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
afs_put_writeback(wb);
_leave(" = %d", ret);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/attr.c b/fs/attr.c
index 6530ced19697..25b24d0f6c88 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
struct timespec now;
unsigned int ia_valid = attr->ia_valid;
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(inode));
if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..f0d268b97d19 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
-/* -*- c -*- ------------------------------------------------------------- *
- *
- * linux/fs/autofs/autofs_i.h
- *
- * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
/* Internal header file for autofs */
@@ -35,28 +31,23 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
/* #define DEBUG */
-#define DPRINTK(fmt, ...) \
- pr_debug("pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_WARN(fmt, ...) \
- printk(KERN_WARNING "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-#define AUTOFS_ERROR(fmt, ...) \
- printk(KERN_ERR "pid %d: %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__)
-
-/* Unified info structure. This is pointed to by both the dentry and
- inode structures. Each file in the filesystem has an instance of this
- structure. It holds a reference to the dentry, so dentries are never
- flushed while the file exists. All name lookups are dealt with at the
- dentry level, although the filesystem can interfere in the validation
- process. Readdir is implemented by traversing the dentry lists. */
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__
+
+/*
+ * Unified info structure. This is pointed to by both the dentry and
+ * inode structures. Each file in the filesystem has an instance of this
+ * structure. It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists. All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process. Readdir is implemented by traversing the dentry lists.
+ */
struct autofs_info {
struct dentry *dentry;
struct inode *inode;
@@ -78,7 +69,7 @@ struct autofs_info {
kgid_t gid;
};
-#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */
#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
* for expiry, so RCU_walk is
* not permitted
@@ -140,10 +131,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
}
/* autofs4_oz_mode(): do we see the man behind the curtain? (The
- processes which do manipulations for us in user space sees the raw
- filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
@@ -154,12 +146,12 @@ void autofs4_free_ino(struct autofs_info *);
int is_autofs4_dentry(struct dentry *);
int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *,
- struct autofs_packet_expire __user *);
+ struct autofs_sb_info *,
+ struct autofs_packet_expire __user *);
int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int when);
int autofs4_expire_multi(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *, int __user *);
+ struct autofs_sb_info *, int __user *);
struct dentry *autofs4_expire_direct(struct super_block *sb,
struct vfsmount *mnt,
struct autofs_sb_info *sbi, int how);
@@ -224,8 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
/* Queue management functions */
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs4_catatonic_mode(struct autofs_sb_info *);
static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +234,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
}
- return;
}
static inline void autofs4_add_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (list_empty(&ino->expiring))
list_add(&ino->expiring, &sbi->expiring_list);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static inline void autofs4_del_expiring(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!list_empty(&ino->expiring))
list_del_init(&ino->expiring);
spin_unlock(&sbi->lookup_lock);
}
- return;
}
extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c7fcc7438843 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,13 +72,13 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
{
int err = 0;
- if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
- (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
- AUTOFS_WARN("ioctl control interface version mismatch: "
- "kernel(%u.%u), user(%u.%u), cmd(%d)",
- AUTOFS_DEV_IOCTL_VERSION_MAJOR,
- AUTOFS_DEV_IOCTL_VERSION_MINOR,
- param->ver_major, param->ver_minor, cmd);
+ if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+ (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
+ pr_warn("ioctl control interface version mismatch: "
+ "kernel(%u.%u), user(%u.%u), cmd(%d)\n",
+ AUTOFS_DEV_IOCTL_VERSION_MAJOR,
+ AUTOFS_DEV_IOCTL_VERSION_MINOR,
+ param->ver_major, param->ver_minor, cmd);
err = -EINVAL;
}
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
* Copy parameter control struct, including a possible path allocated
* at the end of the struct.
*/
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+ copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
struct autofs_dev_ioctl tmp, *res;
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
{
kfree(param);
- return;
}
/*
@@ -129,24 +129,24 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
err = check_dev_ioctl_version(cmd, param);
if (err) {
- AUTOFS_WARN("invalid device control module version "
- "supplied for cmd(0x%08x)", cmd);
+ pr_warn("invalid device control module version "
+ "supplied for cmd(0x%08x)\n", cmd);
goto out;
}
if (param->size > sizeof(*param)) {
err = invalid_str(param->path, param->size - sizeof(*param));
if (err) {
- AUTOFS_WARN(
- "path string terminator missing for cmd(0x%08x)",
+ pr_warn(
+ "path string terminator missing for cmd(0x%08x)\n",
cmd);
goto out;
}
err = check_name(param->path);
if (err) {
- AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
- cmd);
+ pr_warn("invalid path supplied for cmd(0x%08x)\n",
+ cmd);
goto out;
}
}
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
void *data)
{
struct path path;
- int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+ int err;
+
+ err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
if (err)
return err;
err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
static int test_by_type(struct path *path, void *p)
{
struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
return ino && ino->sbi->type & *(unsigned *)p;
}
@@ -370,7 +373,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
new_pid = get_task_pid(current, PIDTYPE_PGID);
if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
- AUTOFS_WARN("Not allowed to change PID namespace");
+ pr_warn("not allowed to change PID namespace\n");
err = -EINVAL;
goto out;
}
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
err = 0;
autofs4_expire_wait(path.dentry, 0);
spin_lock(&sbi->fs_lock);
- param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
- param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+ param->requester.uid =
+ from_kuid_munged(current_user_ns(), ino->uid);
+ param->requester.gid =
+ from_kgid_munged(current_user_ns(), ino->gid);
spin_unlock(&sbi->fs_lock);
}
path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
}
/* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+ struct autofs_dev_ioctl __user *user)
{
struct autofs_dev_ioctl *param;
struct file *fp;
@@ -655,7 +661,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
fn = lookup_dev_ioctl(cmd);
if (!fn) {
- AUTOFS_WARN("unknown command 0x%08x", command);
+ pr_warn("unknown command 0x%08x\n", command);
return -ENOTTY;
}
@@ -711,6 +717,7 @@ out:
static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
{
int err;
+
err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
return (long) err;
}
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
static struct miscdevice _autofs_dev_ioctl_misc = {
.minor = AUTOFS_MINOR,
- .name = AUTOFS_DEVICE_NAME,
- .fops = &_dev_ioctl_fops
+ .name = AUTOFS_DEVICE_NAME,
+ .fops = &_dev_ioctl_fops
};
MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -747,7 +754,7 @@ int __init autofs_dev_ioctl_init(void)
r = misc_register(&_autofs_dev_ioctl_misc);
if (r) {
- AUTOFS_ERROR("misc_register failed for control device");
+ pr_err("misc_register failed for control device\n");
return r;
}
@@ -757,6 +764,4 @@ int __init autofs_dev_ioctl_init(void)
void autofs_dev_ioctl_exit(void)
{
misc_deregister(&_autofs_dev_ioctl_misc);
- return;
}
-
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..9510d8d2e9cd 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
@@ -18,7 +14,7 @@ static unsigned long now;
/* Check if a dentry can be expired */
static inline int autofs4_can_expire(struct dentry *dentry,
- unsigned long timeout, int do_now)
+ unsigned long timeout, int do_now)
{
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -41,7 +37,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
struct path path = {.mnt = mnt, .dentry = dentry};
int status = 1;
- DPRINTK("dentry %p %pd", dentry, dentry);
+ pr_debug("dentry %p %pd\n", dentry, dentry);
path_get(&path);
@@ -58,14 +54,16 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
/* Update the expiry counter if fs is busy */
if (!may_umount_tree(path.mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
ino->last_used = jiffies;
goto done;
}
status = 0;
done:
- DPRINTK("returning = %d", status);
+ pr_debug("returning = %d\n", status);
path_put(&path);
return status;
}
@@ -74,7 +72,7 @@ done:
* Calculate and dget next entry in the subdirs list under root.
*/
static struct dentry *get_next_positive_subdir(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -121,7 +119,7 @@ cont:
* Calculate and dget next entry in top down tree traversal.
*/
static struct dentry *get_next_positive_dentry(struct dentry *prev,
- struct dentry *root)
+ struct dentry *root)
{
struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
struct list_head *next;
@@ -187,15 +185,17 @@ again:
* autofs submounts.
*/
static int autofs4_direct_busy(struct vfsmount *mnt,
- struct dentry *top,
- unsigned long timeout,
- int do_now)
+ struct dentry *top,
+ unsigned long timeout,
+ int do_now)
{
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* If it's busy update the expiry counters */
if (!may_umount_tree(mnt)) {
- struct autofs_info *ino = autofs4_dentry_ino(top);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(top);
if (ino)
ino->last_used = jiffies;
return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
return 0;
}
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
* The tree is not busy iff no mountpoints are busy
*/
static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -219,7 +220,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
struct autofs_info *top_ino = autofs4_dentry_ino(top);
struct dentry *p;
- DPRINTK("top %p %pd", top, top);
+ pr_debug("top %p %pd\n", top, top);
/* Negative dentry - give up */
if (!simple_positive(top))
@@ -227,7 +228,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
p = NULL;
while ((p = get_next_positive_dentry(p, top))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
/*
* Is someone visiting anywhere in the subtree ?
@@ -273,11 +274,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
{
struct dentry *p;
- DPRINTK("parent %p %pd", parent, parent);
+ pr_debug("parent %p %pd\n", parent, parent);
p = NULL;
while ((p = get_next_positive_dentry(p, parent))) {
- DPRINTK("dentry %p %pd", p, p);
+ pr_debug("dentry %p %pd\n", p, p);
if (d_mountpoint(p)) {
/* Can we umount this guy */
@@ -362,7 +363,7 @@ static struct dentry *should_expire(struct dentry *dentry,
* offset (autofs-5.0+).
*/
if (d_mountpoint(dentry)) {
- DPRINTK("checking mountpoint %p %pd", dentry, dentry);
+ pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
@@ -375,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
- DPRINTK("checking symlink %p %pd", dentry, dentry);
+ pr_debug("checking symlink %p %pd\n", dentry, dentry);
/*
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
} else {
/* Path walk currently on this dentry? */
struct dentry *expired;
+
ino_count = atomic_read(&ino->count) + 1;
if (d_count(dentry) > ino_count)
return NULL;
@@ -471,7 +473,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
return NULL;
found:
- DPRINTK("returning %p %pd", expired, expired);
+ pr_debug("returning %p %pd\n", expired, expired);
ino->flags |= AUTOFS_INF_EXPIRING;
smp_mb();
ino->flags &= ~AUTOFS_INF_NO_RCU;
@@ -503,12 +505,12 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
if (ino->flags & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
- DPRINTK("waiting for expire %p name=%pd", dentry, dentry);
+ pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
status = autofs4_wait(sbi, dentry, NFY_NONE);
wait_for_completion(&ino->expire_complete);
- DPRINTK("expire done status=%d", status);
+ pr_debug("expire done status=%d\n", status);
if (d_unhashed(dentry))
return -EAGAIN;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
/* Perform an expiry operation */
int autofs4_expire_run(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- struct autofs_packet_expire __user *pkt_p)
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ struct autofs_packet_expire __user *pkt_p)
{
struct autofs_packet_expire pkt;
struct autofs_info *ino;
struct dentry *dentry;
int ret = 0;
- memset(&pkt,0,sizeof pkt);
+ memset(&pkt, 0, sizeof(pkt));
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = autofs_ptype_expire;
- if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+ dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+ if (!dentry)
return -EAGAIN;
pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
pkt.name[pkt.len] = '\0';
dput(dentry);
- if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+ if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
ret = -EFAULT;
spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_info *ino = autofs4_dentry_ino(dentry);
/* This is synchronous because it makes the daemon a
- little easier */
+ * little easier
+ */
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
return ret;
}
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
- more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int __user *arg)
{
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/module.h>
#include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index a3ae0b2aeb5a..61b21051bd5a 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -24,7 +20,9 @@
struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
{
- struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+ struct autofs_info *ino;
+
+ ino = kzalloc(sizeof(*ino), GFP_KERNEL);
if (ino) {
INIT_LIST_HEAD(&ino->active);
INIT_LIST_HEAD(&ino->expiring);
@@ -62,7 +60,7 @@ void autofs4_kill_sb(struct super_block *sb)
put_pid(sbi->oz_pgrp);
}
- DPRINTK("shutting down");
+ pr_debug("shutting down\n");
kill_litter_super(sb);
if (sbi)
kfree_rcu(sbi, rcu);
@@ -94,7 +92,12 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",direct");
else
seq_printf(m, ",indirect");
-
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (sbi->pipe)
+ seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+ else
+ seq_printf(m, ",pipe_ino=-1");
+#endif
return 0;
}
@@ -147,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
while ((p = strsep(&options, ",")) != NULL) {
int token;
+
if (!*p)
continue;
@@ -204,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
int autofs4_fill_super(struct super_block *s, void *data, int silent)
{
- struct inode * root_inode;
- struct dentry * root;
- struct file * pipe;
+ struct inode *root_inode;
+ struct dentry *root;
+ struct file *pipe;
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
@@ -217,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
- DPRINTK("starting up, sbi = %p",sbi);
+ pr_debug("starting up, sbi = %p\n", sbi);
s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
@@ -266,14 +270,14 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
&pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
&sbi->max_proto)) {
- printk("autofs: called with bogus options\n");
+ pr_err("called with bogus options\n");
goto fail_dput;
}
if (pgrp_set) {
sbi->oz_pgrp = find_get_pid(pgrp);
if (!sbi->oz_pgrp) {
- pr_warn("autofs: could not find process group %d\n",
+ pr_err("could not find process group %d\n",
pgrp);
goto fail_dput;
}
@@ -290,10 +294,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Couldn't this be tested earlier? */
if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION ||
sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) {
- printk("autofs: kernel does not match daemon version "
+ pr_err("kernel does not match daemon version "
"daemon (%d, %d) kernel (%d, %d)\n",
- sbi->min_proto, sbi->max_proto,
- AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
+ sbi->min_proto, sbi->max_proto,
+ AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION);
goto fail_dput;
}
@@ -304,11 +308,11 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
+ pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
if (!pipe) {
- printk("autofs: could not open pipe file descriptor\n");
+ pr_err("could not open pipe file descriptor\n");
goto fail_dput;
}
ret = autofs_prepare_pipe(pipe);
@@ -323,12 +327,12 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
*/
s->s_root = root;
return 0;
-
+
/*
* Failure ... clean up.
*/
fail_fput:
- printk("autofs: pipe file descriptor does not contain proper ops\n");
+ pr_err("pipe file descriptor does not contain proper ops\n");
fput(pipe);
/* fall through */
fail_dput:
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..7ab923940d18 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/capability.h>
#include <linux/errno.h>
@@ -23,16 +19,18 @@
#include "autofs_i.h"
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+ unsigned int, unsigned long);
#endif
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+ struct dentry *, unsigned int);
static struct vfsmount *autofs4_d_automount(struct path *);
static int autofs4_d_manage(struct dentry *, bool);
static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
static void autofs4_add_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
ino->active_count++;
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static void autofs4_del_active(struct dentry *dentry)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino;
+
+ ino = autofs4_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
}
spin_unlock(&sbi->lookup_lock);
}
- return;
}
static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -108,7 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
struct dentry *dentry = file->f_path.dentry;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry);
+ pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
if (autofs4_oz_mode(sbi))
goto out;
@@ -138,7 +138,7 @@ static void autofs4_dentry_release(struct dentry *de)
struct autofs_info *ino = autofs4_dentry_ino(de);
struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
- DPRINTK("releasing %p", de);
+ pr_debug("releasing %p\n", de);
if (!ino)
return;
@@ -278,9 +278,9 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
- DPRINTK("waiting for mount name=%pd", dentry);
+ pr_debug("waiting for mount name=%pd\n", dentry);
status = autofs4_wait(sbi, dentry, NFY_MOUNT);
- DPRINTK("mount wait done status=%d", status);
+ pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
return status;
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
struct dentry *parent = dentry->d_parent;
struct autofs_info *ino;
- struct dentry *new = d_lookup(parent, &dentry->d_name);
+ struct dentry *new;
+
+ new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
ino = autofs4_dentry_ino(new);
@@ -338,7 +340,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never triggers a mount. */
if (autofs4_oz_mode(sbi))
@@ -425,7 +427,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
- DPRINTK("dentry=%p %pd", dentry, dentry);
+ pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never waits. */
if (autofs4_oz_mode(sbi)) {
@@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
* a mount-trap.
*/
struct inode *inode;
+
if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
return 0;
if (d_mountpoint(dentry))
@@ -494,13 +497,14 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
struct dentry *active;
- DPRINTK("name = %pd", dentry);
+ pr_debug("name = %pd\n", dentry);
/* File name too long to exist */
if (dentry->d_name.len > NAME_MAX)
@@ -508,14 +512,14 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
sbi = autofs4_sbi(dir->i_sb);
- DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
- current->pid, task_pgrp_nr(current), sbi->catatonic,
- autofs4_oz_mode(sbi));
+ pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
+ current->pid, task_pgrp_nr(current), sbi->catatonic,
+ autofs4_oz_mode(sbi));
active = autofs4_lookup_active(dentry);
- if (active) {
+ if (active)
return active;
- } else {
+ else {
/*
* A dentry that is not within the root can never trigger a
* mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
return ERR_PTR(-ENOENT);
/* Mark entries in the root as mount triggers */
- if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+ if (IS_ROOT(dentry->d_parent) &&
+ autofs_type_indirect(sbi->type))
__managed_dentry_set_managed(dentry);
ino = autofs4_new_ino(sbi);
@@ -537,8 +542,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
ino->dentry = dentry;
autofs4_add_active(dentry);
-
- d_instantiate(dentry, NULL);
}
return NULL;
}
@@ -554,7 +557,7 @@ static int autofs4_dir_symlink(struct inode *dir,
size_t size = strlen(symname);
char *cp;
- DPRINTK("%s <- %pd", symname, dentry);
+ pr_debug("%s <- %pd\n", symname, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -613,7 +616,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
+
/* This allows root to remove symlinks */
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -664,7 +667,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
if (IS_ROOT(parent->d_parent))
return;
managed_dentry_clear_managed(parent);
- return;
}
static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +689,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
if (d_child->next == &parent->d_subdirs &&
d_child->prev == &parent->d_subdirs)
managed_dentry_set_managed(parent);
- return;
}
static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -695,8 +696,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
struct autofs_info *p_ino;
-
- DPRINTK("dentry %p, removing %pd", dentry, dentry);
+
+ pr_debug("dentry %p, removing %pd\n", dentry, dentry);
if (!autofs4_oz_mode(sbi))
return -EACCES;
@@ -728,7 +729,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int autofs4_dir_mkdir(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -738,7 +740,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
if (!autofs4_oz_mode(sbi))
return -EACCES;
- DPRINTK("dentry %p, creating %pd", dentry, dentry);
+ pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
@@ -768,14 +770,18 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
/* Get/set timeout ioctl() operation */
#ifdef CONFIG_COMPAT
static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
- compat_ulong_t __user *p)
+ compat_ulong_t __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
+
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > UINT_MAX/HZ)
sbi->exp_timeout = 0;
@@ -783,18 +789,24 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
#endif
static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
- unsigned long __user *p)
+ unsigned long __user *p)
{
- int rv;
unsigned long ntimeout;
+ int rv;
+
+ rv = get_user(ntimeout, p);
+ if (rv)
+ goto error;
- if ((rv = get_user(ntimeout, p)) ||
- (rv = put_user(sbi->exp_timeout/HZ, p)))
- return rv;
+ rv = put_user(sbi->exp_timeout/HZ, p);
+ if (rv)
+ goto error;
if (ntimeout > ULONG_MAX/HZ)
sbi->exp_timeout = 0;
@@ -802,16 +814,20 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
sbi->exp_timeout = ntimeout * HZ;
return 0;
+error:
+ return rv;
}
/* Return protocol version */
-static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->version, p);
}
/* Return protocol sub version */
-static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
+ int __user *p)
{
return put_user(sbi->sub_version, p);
}
@@ -826,7 +842,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
if (may_umount(mnt))
status = 1;
- DPRINTK("returning %d", status);
+ pr_debug("returning %d\n", status);
status = put_user(status, p);
@@ -834,9 +850,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
}
/* Identify autofs4_dentries - this is so we can tell if there's
- an extra dentry refcount or not. We only hold a refcount on the
- dentry if its non-negative (ie, d_inode != NULL)
-*/
+ * an extra dentry refcount or not. We only hold a refcount on the
+ * dentry if its non-negative (ie, d_inode != NULL)
+ */
int is_autofs4_dentry(struct dentry *dentry)
{
return dentry && d_really_is_positive(dentry) &&
@@ -854,21 +870,21 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
void __user *p = (void __user *)arg;
- DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
- cmd,arg,sbi,task_pgrp_nr(current));
+ pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
+ cmd, arg, sbi, task_pgrp_nr(current));
if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
-
+
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
-
- switch(cmd) {
+
+ switch (cmd) {
case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
- return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+ return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
autofs4_catatonic_mode(sbi);
return 0;
@@ -888,13 +904,15 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
/* return a single thing to expire */
case AUTOFS_IOC_EXPIRE:
- return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_run(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
/* same as above, but can send multiple expires through pipe */
case AUTOFS_IOC_EXPIRE_MULTI:
- return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p);
+ return autofs4_expire_multi(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
default:
- return -ENOSYS;
+ return -EINVAL;
}
}
@@ -902,12 +920,13 @@ static long autofs4_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
+
return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
static long autofs4_root_compat_ioctl(struct file *filp,
- unsigned int cmd, unsigned long arg)
+ unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
int ret;
@@ -916,7 +935,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
else
ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
- (unsigned long)compat_ptr(arg));
+ (unsigned long) compat_ptr(arg));
return ret;
}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index da0c33481bc0..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,21 +1,24 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include "autofs_i.h"
-static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
+static const char *autofs4_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi;
+ struct autofs_info *ino;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ sbi = autofs4_sbi(dentry->d_sb);
+ ino = autofs4_dentry_ino(dentry);
if (ino && !autofs4_oz_mode(sbi))
ino->last_used = jiffies;
return d_inode(dentry)->i_private;
@@ -23,5 +26,5 @@ static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
const struct inode_operations autofs4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = autofs4_follow_link
+ .get_link = autofs4_get_link
};
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..0146d911f468 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- * Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
#include <linux/slab.h>
#include <linux/time.h>
@@ -18,7 +14,8 @@
#include "autofs_i.h"
/* We make this a static variable rather than a part of the superblock; it
- is better if we don't reassign numbers easily even across filesystems */
+ * is better if we don't reassign numbers easily even across filesystems
+ */
static autofs_wqt_t autofs4_next_wait_queue = 1;
/* These are the signals we allow interrupting a pending mount */
@@ -34,7 +31,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
return;
}
- DPRINTK("entering catatonic mode");
+ pr_debug("entering catatonic mode\n");
sbi->catatonic = 1;
wq = sbi->queues;
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
set_fs(KERNEL_DS);
mutex_lock(&sbi->pipe_mutex);
- while (bytes &&
- (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
+ while (bytes && wr) {
data += wr;
bytes -= wr;
+ wr = __vfs_write(file, data, bytes, &file->f_pos);
}
mutex_unlock(&sbi->pipe_mutex);
set_fs(fs);
/* Keep the currently executing process from receiving a
- SIGPIPE unless it was already supposed to get one */
+ * SIGPIPE unless it was already supposed to get one
+ */
if (wr == -EPIPE && !sigpipe) {
spin_lock_irqsave(&current->sighand->siglock, flags);
sigdelset(&current->pending.signal, SIGPIPE);
@@ -89,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
return (bytes > 0);
}
-
+
static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
@@ -102,10 +101,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
struct file *pipe = NULL;
size_t pktsz;
- DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
+ pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
+ (unsigned long) wq->wait_queue_token,
+ wq->name.len, wq->name.name, type);
- memset(&pkt,0,sizeof pkt); /* For security reasons */
+ memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
}
case autofs_ptype_expire_multi:
{
- struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+ struct autofs_packet_expire_multi *ep =
+ &pkt.v4_pkt.expire_multi;
pktsz = sizeof(*ep);
@@ -163,7 +164,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
break;
}
default:
- printk("autofs4_notify_daemon: bad type %d!\n", type);
+ pr_warn("bad type %d!\n", type);
mutex_unlock(&sbi->wq_mutex);
return;
}
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
if (wq->name.hash == qstr->hash &&
wq->name.len == qstr->len &&
wq->name.name &&
- !memcmp(wq->name.name, qstr->name, qstr->len))
+ !memcmp(wq->name.name, qstr->name, qstr->len))
break;
}
return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
static int validate_request(struct autofs_wait_queue **wait,
struct autofs_sb_info *sbi,
struct qstr *qstr,
- struct dentry*dentry, enum autofs_notify notify)
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
* continue on and create a new request.
*/
if (!IS_ROOT(dentry)) {
- if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
+ if (d_unhashed(dentry) &&
+ d_really_is_positive(dentry)) {
struct dentry *parent = dentry->d_parent;
+
new = d_lookup(parent, &dentry->d_name);
if (new)
dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
return 1;
}
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
- enum autofs_notify notify)
+int autofs4_wait(struct autofs_sb_info *sbi,
+ struct dentry *dentry, enum autofs_notify notify)
{
struct autofs_wait_queue *wq;
struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
if (!wq) {
/* Create a new wait queue */
- wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
if (!wq) {
kfree(qstr.name);
mutex_unlock(&sbi->wq_mutex);
@@ -450,17 +453,19 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
autofs_ptype_expire_indirect;
}
- DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
- /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
+ /*
+ * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
+ */
autofs4_notify_daemon(sbi, wq, type);
} else {
wq->wait_ctr++;
- DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
- (unsigned long) wq->wait_queue_token, wq->name.len,
- wq->name.name, notify);
+ pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
kfree(qstr.name);
}
@@ -471,12 +476,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
*/
if (wq->name.name) {
/* Block all but "shutdown" signals while waiting */
- sigset_t oldset;
+ unsigned long shutdown_sigs_mask;
unsigned long irqflags;
+ sigset_t oldset;
spin_lock_irqsave(&current->sighand->siglock, irqflags);
oldset = current->blocked;
- siginitsetinv(&current->blocked, SHUTDOWN_SIGS & ~oldset.sig[0]);
+ shutdown_sigs_mask = SHUTDOWN_SIGS & ~oldset.sig[0];
+ siginitsetinv(&current->blocked, shutdown_sigs_mask);
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
@@ -487,7 +494,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
} else {
- DPRINTK("skipped sleeping");
+ pr_debug("skipped sleeping\n");
}
status = wq->status;
@@ -562,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
return 0;
}
-
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 861b1e1c4777..103f5d7c3083 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -192,7 +192,7 @@ EXPORT_SYMBOL(make_bad_inode);
* Returns true if the inode in question has been marked as bad.
*/
-int is_bad_inode(struct inode *inode)
+bool is_bad_inode(struct inode *inode)
{
return (inode->i_op == &bad_inode_ops);
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 46aedacfa6a8..cc0e08252913 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
static void befs_destroy_inodecache(void);
-static const char *befs_follow_link(struct dentry *, void **);
+static int befs_symlink_readpage(struct file *, struct page *);
static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -79,10 +79,8 @@ static const struct address_space_operations befs_aops = {
.bmap = befs_bmap,
};
-static const struct inode_operations befs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = befs_follow_link,
- .put_link = kfree_put_link,
+static const struct address_space_operations befs_symlink_aops = {
+ .readpage = befs_symlink_readpage,
};
/*
@@ -398,7 +396,9 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &befs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
- inode->i_op = &befs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &befs_symlink_aops;
} else {
inode->i_link = befs_ino->i_data.symlink;
inode->i_op = &simple_symlink_inode_operations;
@@ -434,7 +434,7 @@ befs_init_inodecache(void)
befs_inode_cachep = kmem_cache_create("befs_inode_cache",
sizeof (struct befs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (befs_inode_cachep == NULL) {
pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
@@ -463,31 +463,33 @@ befs_destroy_inodecache(void)
* The data stream become link name. Unless the LONG_SYMLINK
* flag is set.
*/
-static const char *
-befs_follow_link(struct dentry *dentry, void **cookie)
+static int befs_symlink_readpage(struct file *unused, struct page *page)
{
- struct super_block *sb = dentry->d_sb;
- struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+ struct inode *inode = page->mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct befs_inode_info *befs_ino = BEFS_I(inode);
befs_data_stream *data = &befs_ino->i_data.ds;
befs_off_t len = data->size;
- char *link;
+ char *link = page_address(page);
- if (len == 0) {
+ if (len == 0 || len > PAGE_SIZE) {
befs_error(sb, "Long symlink with illegal length");
- return ERR_PTR(-EIO);
+ goto fail;
}
befs_debug(sb, "Follow long symlink");
- link = kmalloc(len, GFP_NOFS);
- if (!link)
- return ERR_PTR(-ENOMEM);
if (befs_read_lsymlink(sb, data, link, len) != len) {
- kfree(link);
befs_error(sb, "Failed to read entire long symlink");
- return ERR_PTR(-EIO);
+ goto fail;
}
link[len - 1] = '\0';
- return *cookie = link;
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+fail:
+ SetPageError(page);
+ unlock_page(page);
+ return -EIO;
}
/*
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fdcb4d69f430..1e5c896f6b79 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -270,7 +270,7 @@ static int __init init_inodecache(void)
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (bfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6b659967898e..81381cc0dd17 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -35,6 +35,7 @@
#include <linux/utsname.h>
#include <linux/coredump.h>
#include <linux/sched.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -487,9 +488,10 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
}
/**
- * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * arch_check_elf() - check an ELF executable
* @ehdr: The main ELF header
* @has_interp: True if the ELF has an interpreter, else false.
+ * @interp_ehdr: The interpreter's ELF header
* @state: Architecture-specific state preserved throughout the process
* of loading the ELF.
*
@@ -501,6 +503,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
* with that return code.
*/
static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
+ struct elfhdr *interp_ehdr,
struct arch_elf_state *state)
{
/* Dummy implementation, always proceed */
@@ -650,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- random_variable = (unsigned long) get_random_int();
+ random_variable = get_random_long();
random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
@@ -759,16 +762,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
would_dump(bprm, interpreter);
- retval = kernel_read(interpreter, 0, bprm->buf,
- BINPRM_BUF_SIZE);
- if (retval != BINPRM_BUF_SIZE) {
+ /* Get the exec headers */
+ retval = kernel_read(interpreter, 0,
+ (void *)&loc->interp_elf_ex,
+ sizeof(loc->interp_elf_ex));
+ if (retval != sizeof(loc->interp_elf_ex)) {
if (retval >= 0)
retval = -EIO;
goto out_free_dentry;
}
- /* Get the exec headers */
- loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
elf_ppnt++;
@@ -828,7 +831,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
* still possible to return an error to the code that invoked
* the exec syscall.
*/
- retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state);
+ retval = arch_check_elf(&loc->elf_ex,
+ !!interpreter, &loc->interp_elf_ex,
+ &arch_state);
if (retval)
goto out_free_dentry;
@@ -1236,6 +1241,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
if (vma->vm_flags & VM_DONTDUMP)
return 0;
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
/* Hugetlb memory check */
if (vma->vm_flags & VM_HUGETLB) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
@@ -2278,7 +2292,7 @@ static int elf_core_dump(struct coredump_params *cprm)
void *kaddr = kmap(page);
stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
} else
stop = !dump_skip(cprm, PAGE_SIZE);
if (stop)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d3634bfb7fe1..083ea2bc60ab 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -35,6 +35,7 @@
#include <linux/elf-fdpic.h>
#include <linux/elfcore.h>
#include <linux/coredump.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
@@ -103,19 +104,36 @@ static void __exit exit_elf_fdpic_binfmt(void)
core_initcall(init_elf_fdpic_binfmt);
module_exit(exit_elf_fdpic_binfmt);
-static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
+static int is_elf(struct elfhdr *hdr, struct file *file)
{
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0)
return 0;
if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)
return 0;
- if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
+ if (!elf_check_arch(hdr))
return 0;
if (!file->f_op->mmap)
return 0;
return 1;
}
+#ifndef elf_check_fdpic
+#define elf_check_fdpic(x) 0
+#endif
+
+#ifndef elf_check_const_displacement
+#define elf_check_const_displacement(x) 0
+#endif
+
+static int is_constdisp(struct elfhdr *hdr)
+{
+ if (!elf_check_fdpic(hdr))
+ return 1;
+ if (elf_check_const_displacement(hdr))
+ return 1;
+ return 0;
+}
+
/*****************************************************************************/
/*
* read the program headers table into memory
@@ -191,8 +209,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* check that this is a binary we know how to deal with */
retval = -ENOEXEC;
- if (!is_elf_fdpic(&exec_params.hdr, bprm->file))
+ if (!is_elf(&exec_params.hdr, bprm->file))
+ goto error;
+ if (!elf_check_fdpic(&exec_params.hdr)) {
+#ifdef CONFIG_MMU
+ /* binfmt_elf handles non-fdpic elf except on nommu */
goto error;
+#else
+ /* nommu can only load ET_DYN (PIE) ELF */
+ if (exec_params.hdr.e_type != ET_DYN)
+ goto error;
+#endif
+ }
/* read the program header table */
retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file);
@@ -269,13 +297,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
}
- if (elf_check_const_displacement(&exec_params.hdr))
+ if (is_constdisp(&exec_params.hdr))
exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* perform insanity checks on the interpreter */
if (interpreter_name) {
retval = -ELIBBAD;
- if (!is_elf_fdpic(&interp_params.hdr, interpreter))
+ if (!is_elf(&interp_params.hdr, interpreter))
goto error;
interp_params.flags = ELF_FDPIC_FLAG_PRESENT;
@@ -306,9 +334,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = -ENOEXEC;
if (stack_size == 0)
- goto error;
+ stack_size = 131072UL; /* same as exec.c's default commit */
- if (elf_check_const_displacement(&interp_params.hdr))
+ if (is_constdisp(&interp_params.hdr))
interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* flush all traces of the currently running executable */
@@ -319,7 +347,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
- set_personality(PER_LINUX_FDPIC);
+ if (elf_check_fdpic(&exec_params.hdr))
+ set_personality(PER_LINUX_FDPIC);
+ else
+ set_personality(PER_LINUX);
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -374,10 +405,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
PAGE_ALIGN(current->mm->start_brk);
#else
- /* create a stack and brk area big enough for everyone
- * - the brk heap starts at the bottom and works up
- * - the stack starts at the top and works down
- */
+ /* create a stack area and zero-size brk area */
stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
if (stack_size < PAGE_SIZE * 2)
stack_size = PAGE_SIZE * 2;
@@ -400,8 +428,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->brk = current->mm->start_brk;
current->mm->context.end_brk = current->mm->start_brk;
- current->mm->context.end_brk +=
- (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
@@ -1206,6 +1232,20 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
return 0;
}
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if (vma->vm_flags & VM_SHARED) {
+ dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ } else {
+ dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ }
+ return dump_ok;
+ }
+
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
if (file_inode(vma->vm_file)->i_nlink == 0) {
@@ -1493,7 +1533,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
void *kaddr = kmap(page);
res = dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
} else {
res = dump_skip(cprm, PAGE_SIZE);
}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 78f005f37847..3a3ced779fc7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
case 3:
/* Delete this handler. */
root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
kill_node(e);
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(root);
break;
default:
@@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
root = dget(sb->s_root);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
@@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
out2:
dput(dentry);
out:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(root);
if (err) {
@@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
case 3:
/* Delete all handlers. */
root = dget(file->f_path.dentry->d_sb->s_root);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
while (!list_empty(&entries))
kill_node(list_entry(entries.next, Node, list));
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(root);
break;
default:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0a793c7930eb..20a2c02b77c4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -50,12 +50,21 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
-static void bdev_write_inode(struct inode *inode)
+static void bdev_write_inode(struct block_device *bdev)
{
+ struct inode *inode = bdev->bd_inode;
+ int ret;
+
spin_lock(&inode->i_lock);
while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
- WARN_ON_ONCE(write_inode_now(inode, true));
+ ret = write_inode_now(inode, true);
+ if (ret) {
+ char name[BDEVNAME_SIZE];
+ pr_warn_ratelimited("VFS: Dirty inode writeback failed "
+ "for block device %s (err=%d).\n",
+ bdevname(bdev, name), ret);
+ }
spin_lock(&inode->i_lock);
}
spin_unlock(&inode->i_lock);
@@ -66,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+ if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
return;
invalidate_bh_lrus();
@@ -147,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
+static struct inode *bdev_file_inode(struct file *file)
+{
+ return file->f_mapping->host;
+}
+
static ssize_t
blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = bdev_file_inode(file);
if (IS_DAX(inode))
return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
@@ -317,7 +331,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -329,18 +343,18 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
*/
static loff_t block_llseek(struct file *file, loff_t offset, int whence)
{
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t retval;
- mutex_lock(&bd_inode->i_mutex);
+ inode_lock(bd_inode);
retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
- mutex_unlock(&bd_inode->i_mutex);
+ inode_unlock(bd_inode);
return retval;
}
int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
- struct inode *bd_inode = filp->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
int error;
@@ -381,9 +395,17 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
struct page *page)
{
const struct block_device_operations *ops = bdev->bd_disk->fops;
+ int result = -EOPNOTSUPP;
+
if (!ops->rw_page || bdev_get_integrity(bdev))
- return -EOPNOTSUPP;
- return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ return result;
+
+ result = blk_queue_enter(bdev->bd_queue, false);
+ if (result)
+ return result;
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ blk_queue_exit(bdev->bd_queue);
+ return result;
}
EXPORT_SYMBOL_GPL(bdev_read_page);
@@ -412,14 +434,20 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
int result;
int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
+
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
+ result = blk_queue_enter(bdev->bd_queue, false);
+ if (result)
+ return result;
+
set_page_writeback(page);
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
if (result)
end_page_writeback(page);
else
unlock_page(page);
+ blk_queue_exit(bdev->bd_queue);
return result;
}
EXPORT_SYMBOL_GPL(bdev_write_page);
@@ -427,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
/**
* bdev_direct_access() - Get the address for directly-accessibly memory
* @bdev: The device containing the memory
- * @sector: The offset within the device
- * @addr: Where to put the address of the memory
- * @pfn: The Page Frame Number for the memory
- * @size: The number of bytes requested
+ * @dax: control and output parameters for ->direct_access
*
* If a block device is made up of directly addressable memory, this function
* will tell the caller the PFN and the address of the memory. The address
@@ -441,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
* Return: negative errno if an error occurs, otherwise the number of bytes
* accessible at this address.
*/
-long bdev_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **addr, unsigned long *pfn, long size)
+long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
{
- long avail;
+ sector_t sector = dax->sector;
+ long avail, size = dax->size;
const struct block_device_operations *ops = bdev->bd_disk->fops;
/*
@@ -463,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, addr, pfn);
+ avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
if (!avail)
return -ERANGE;
+ if (avail > 0 && avail & ~PAGE_MASK)
+ return -ENXIO;
return min(avail, size);
}
EXPORT_SYMBOL_GPL(bdev_direct_access);
@@ -548,7 +575,11 @@ static const struct super_operations bdev_sops = {
static struct dentry *bd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+ struct dentry *dent;
+ dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
+ if (dent)
+ dent->d_sb->s_iflags |= SB_I_CGROUPWB;
+ return dent;
}
static struct file_system_type bd_type = {
@@ -567,7 +598,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
@@ -673,7 +704,7 @@ static struct block_device *bd_acquire(struct inode *inode)
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
spin_unlock(&bdev_lock);
return bdev;
}
@@ -689,7 +720,7 @@ static struct block_device *bd_acquire(struct inode *inode)
* So, we can access it via ->i_mapping always
* without igrab().
*/
- ihold(bdev->bd_inode);
+ bdgrab(bdev);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -712,7 +743,7 @@ void bd_forget(struct inode *inode)
spin_unlock(&bdev_lock);
if (bdev)
- iput(bdev->bd_inode);
+ bdput(bdev);
}
/**
@@ -1019,12 +1050,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
static void flush_disk(struct block_device *bdev, bool kill_dirty)
{
if (__invalidate_device(bdev, kill_dirty)) {
- char name[BDEVNAME_SIZE] = "";
-
- if (bdev->bd_disk)
- disk_name(bdev->bd_disk, 0, name);
printk(KERN_WARNING "VFS: busy inodes on changed media or "
- "resized disk %s\n", name);
+ "resized disk %s\n",
+ bdev->bd_disk ? bdev->bd_disk->disk_name : "");
}
if (!bdev->bd_disk)
@@ -1048,12 +1076,9 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
disk_size = (loff_t)get_capacity(disk) << 9;
bdev_size = i_size_read(bdev->bd_inode);
if (disk_size != bdev_size) {
- char name[BDEVNAME_SIZE];
-
- disk_name(disk, 0, name);
printk(KERN_INFO
"%s: detected capacity change from %lld to %lld\n",
- name, bdev_size, disk_size);
+ disk->disk_name, bdev_size, disk_size);
i_size_write(bdev->bd_inode, disk_size);
flush_disk(bdev, false);
}
@@ -1121,10 +1146,10 @@ void bd_set_size(struct block_device *bdev, loff_t size)
{
unsigned bsize = bdev_logical_block_size(bdev);
- mutex_lock(&bdev->bd_inode->i_mutex);
+ inode_lock(bdev->bd_inode);
i_size_write(bdev->bd_inode, size);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- while (bsize < PAGE_CACHE_SIZE) {
+ inode_unlock(bdev->bd_inode);
+ while (bsize < PAGE_SIZE) {
if (size & bsize)
break;
bsize <<= 1;
@@ -1180,7 +1205,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
+ if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+ bdev->bd_inode->i_flags = S_DAX;
+ else
+ bdev->bd_inode->i_flags = 0;
+
if (!partno) {
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
@@ -1207,8 +1236,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret)
+ if (!ret) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ if (!blkdev_dax_capable(bdev))
+ bdev->bd_inode->i_flags &= ~S_DAX;
+ }
/*
* If the device is invalidated, rescan partition
@@ -1222,6 +1254,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
else if (ret == -ENOMEDIUM)
invalidate_partitions(disk, bdev);
}
+
if (ret)
goto out_clear;
} else {
@@ -1242,12 +1275,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- /*
- * If the partition is not aligned on a page
- * boundary, we can't do dax I/O to it.
- */
- if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
- (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ if (!blkdev_dax_capable(bdev))
bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
@@ -1500,11 +1528,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
+
+ bdev_write_inode(bdev);
/*
- * ->release can cause the queue to disappear, so flush all
- * dirty data before.
+ * Detaching bdev inode from its wb in __destroy_inode()
+ * is too late: the queue which embeds its bdi (along with
+ * root wb) can be gone as soon as we put_disk() below.
*/
- bdev_write_inode(bdev->bd_inode);
+ inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
@@ -1579,14 +1610,14 @@ EXPORT_SYMBOL(blkdev_put);
static int blkdev_close(struct inode * inode, struct file * filp)
{
- struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
blkdev_put(bdev, filp->f_mode);
return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
- struct block_device *bdev = I_BDEV(file->f_mapping->host);
+ struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
@@ -1611,7 +1642,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
ssize_t ret;
@@ -1643,7 +1674,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
- struct inode *bd_inode = file->f_mapping->host;
+ struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
@@ -1670,25 +1701,102 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ if (dax_mapping(mapping)) {
+ struct block_device *bdev = I_BDEV(mapping->host);
+
+ return dax_writeback_mapping_range(mapping, bdev, wbc);
+ }
+ return generic_writepages(mapping, wbc);
+}
+
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.readpages = blkdev_readpages,
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
- .writepages = generic_writepages,
+ .writepages = blkdev_writepages,
.releasepage = blkdev_releasepage,
.direct_IO = blkdev_direct_IO,
.is_dirty_writeback = buffer_check_dirty_writeback,
};
+#ifdef CONFIG_FS_DAX
+/*
+ * In the raw block case we do not need to contend with truncation nor
+ * unwritten file extents. Without those concerns there is no need for
+ * additional locking beyond the mmap_sem context that these routines
+ * are already executing under.
+ *
+ * Note, there is no protection if the block device is dynamically
+ * resized (partition grow/shrink) during a fault. A stable block device
+ * size is already not enforced in the blkdev_direct_IO path.
+ *
+ * For DAX, it is the responsibility of the block device driver to
+ * ensure the whole-disk device size is stable while requests are in
+ * flight.
+ *
+ * Finally, unlike the filemap_page_mkwrite() case there is no
+ * filesystem superblock to sync against freezing. We still include a
+ * pfn_mkwrite callback for dax drivers to receive write fault
+ * notifications.
+ */
+static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+}
+
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ return dax_pfn_mkwrite(vma, vmf);
+}
+
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+}
+
+static const struct vm_operations_struct blkdev_dax_vm_ops = {
+ .fault = blkdev_dax_fault,
+ .pmd_fault = blkdev_dax_pmd_fault,
+ .pfn_mkwrite = blkdev_dax_pfn_mkwrite,
+};
+
+static const struct vm_operations_struct blkdev_default_vm_ops = {
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+};
+
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(file);
+
+ file_accessed(file);
+ if (IS_DAX(bd_inode)) {
+ vma->vm_ops = &blkdev_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+ } else {
+ vma->vm_ops = &blkdev_default_vm_ops;
+ }
+
+ return 0;
+}
+#else
+#define blkdev_mmap generic_file_mmap
+#endif
+
const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
- .mmap = generic_file_mmap,
+ .mmap = blkdev_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6d1d0b93b1aa..128ce17a80b0 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o
+ uuid-tree.o props.o hash.o free-space-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
- tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
+ tests/free-space-tree-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9a0124a95851..6d263bb1621c 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
size = __btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
- value = kzalloc(size, GFP_NOFS);
+ value = kzalloc(size, GFP_KERNEL);
if (!value)
return ERR_PTR(-ENOMEM);
size = __btrfs_getxattr(inode, name, value, size);
@@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &inode->i_mode);
if (ret < 0)
@@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
case ACL_TYPE_DEFAULT:
if (!S_ISDIR(inode->i_mode))
return acl ? -EINVAL : 0;
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
@@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_NOFS);
+ value = kmalloc(size, GFP_KERNEL);
if (!value) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 3e36e4adc4a3..5fb60ea7eee2 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -97,7 +97,7 @@ static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
int thresh)
{
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
int limit_active,
int thresh)
{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
list_add_tail(&work->ordered_list, &wq->ordered_list);
spin_unlock_irqrestore(&wq->list_lock, flags);
}
- queue_work(wq->normal_wq, &work->normal_work);
trace_btrfs_work_queued(work);
+ queue_work(wq->normal_wq, &work->normal_work);
}
void btrfs_queue_work(struct btrfs_workqueue *wq,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6dcdb2ec9211..80e8472d618b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -148,8 +148,7 @@ int __init btrfs_prelim_ref_init(void)
void btrfs_prelim_ref_exit(void)
{
- if (btrfs_prelim_ref_cache)
- kmem_cache_destroy(btrfs_prelim_ref_cache);
+ kmem_cache_destroy(btrfs_prelim_ref_cache);
}
/*
@@ -355,7 +354,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
index = srcu_read_lock(&fs_info->subvol_srcu);
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ root = btrfs_get_fs_root(fs_info, &root_key, false);
if (IS_ERR(root)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
@@ -520,13 +519,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1,
static int __add_missing_keys(struct btrfs_fs_info *fs_info,
struct list_head *head)
{
- struct list_head *pos;
+ struct __prelim_ref *ref;
struct extent_buffer *eb;
- list_for_each(pos, head) {
- struct __prelim_ref *ref;
- ref = list_entry(pos, struct __prelim_ref, list);
-
+ list_for_each_entry(ref, head, list) {
if (ref->parent)
continue;
if (ref->key_for_search.type)
@@ -563,31 +559,20 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
*/
static void __merge_refs(struct list_head *head, int mode)
{
- struct list_head *pos1;
+ struct __prelim_ref *pos1;
- list_for_each(pos1, head) {
- struct list_head *n2;
- struct list_head *pos2;
- struct __prelim_ref *ref1;
+ list_for_each_entry(pos1, head, list) {
+ struct __prelim_ref *pos2 = pos1, *tmp;
- ref1 = list_entry(pos1, struct __prelim_ref, list);
-
- for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
- pos2 = n2, n2 = pos2->next) {
- struct __prelim_ref *ref2;
- struct __prelim_ref *xchg;
+ list_for_each_entry_safe_continue(pos2, tmp, head, list) {
+ struct __prelim_ref *ref1 = pos1, *ref2 = pos2;
struct extent_inode_elem *eie;
- ref2 = list_entry(pos2, struct __prelim_ref, list);
-
if (!ref_for_same_block(ref1, ref2))
continue;
if (mode == 1) {
- if (!ref1->parent && ref2->parent) {
- xchg = ref1;
- ref1 = ref2;
- ref2 = xchg;
- }
+ if (!ref1->parent && ref2->parent)
+ swap(ref1, ref2);
} else {
if (ref1->parent != ref2->parent)
continue;
@@ -1417,7 +1402,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
read_extent_buffer(eb, dest + bytes_left,
name_off, name_len);
if (eb != eb_in) {
- btrfs_tree_read_unlock_blocking(eb);
+ if (!path->skip_locking)
+ btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
ret = btrfs_find_item(fs_root, path, parent, 0,
@@ -1437,9 +1423,10 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
eb = path->nodes[0];
/* make sure we can use eb after releasing the path */
if (eb != eb_in) {
- atomic_inc(&eb->refs);
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ if (!path->skip_locking)
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ path->nodes[0] = NULL;
+ path->locks[0] = 0;
}
btrfs_release_path(path);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc13fae2..61205e3bbefa 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -192,6 +192,10 @@ struct btrfs_inode {
/* File creation time. */
struct timespec i_otime;
+ /* Hook into fs_info->delayed_iputs */
+ struct list_head delayed_iput;
+ long delayed_iput_count;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0340c57bf377..516e19d1d202 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -95,6 +95,7 @@
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>
+#include <linux/string.h>
#include "ctree.h"
#include "disk-io.h"
#include "hash.h"
@@ -105,6 +106,7 @@
#include "locking.h"
#include "check-integrity.h"
#include "rcu-string.h"
+#include "compression.h"
#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@@ -176,7 +178,7 @@ struct btrfsic_block {
* Elements of this type are allocated dynamically and required because
* each block object can refer to and can be ref from multiple blocks.
* The key to lookup them in the hashtable is the dev_bytenr of
- * the block ref to plus the one from the block refered from.
+ * the block ref to plus the one from the block referred from.
* The fact that they are searchable via a hashtable and that a
* ref_cnt is maintained is not required for the btrfs integrity
* check algorithm itself, it is only used to make the output more
@@ -531,13 +533,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup(
(((unsigned int)(dev_bytenr >> 16)) ^
((unsigned int)((uintptr_t)bdev))) &
(BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block *const b =
- list_entry(elem, struct btrfsic_block,
- collision_resolving_node);
+ struct btrfsic_block *b;
+ list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
return b;
}
@@ -588,13 +586,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
((unsigned int)((uintptr_t)bdev_ref_to)) ^
((unsigned int)((uintptr_t)bdev_ref_from))) &
(BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_block_link *const l =
- list_entry(elem, struct btrfsic_block_link,
- collision_resolving_node);
+ struct btrfsic_block_link *l;
+ list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
BUG_ON(NULL == l->block_ref_to);
BUG_ON(NULL == l->block_ref_from);
if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
@@ -639,13 +633,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
const unsigned int hashval =
(((unsigned int)((uintptr_t)bdev)) &
(BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
- struct list_head *elem;
-
- list_for_each(elem, h->table + hashval) {
- struct btrfsic_dev_state *const ds =
- list_entry(elem, struct btrfsic_dev_state,
- collision_resolving_node);
+ struct btrfsic_dev_state *ds;
+ list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
if (ds->bdev == bdev)
return ds;
}
@@ -767,7 +757,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
BUG_ON(NULL == l);
ret = btrfsic_read_block(state, &tmp_next_block_ctx);
- if (ret < (int)PAGE_CACHE_SIZE) {
+ if (ret < (int)PAGE_SIZE) {
printk(KERN_INFO
"btrfsic: read @logical %llu failed!\n",
tmp_next_block_ctx.start);
@@ -1241,15 +1231,15 @@ static void btrfsic_read_from_block_data(
size_t offset_in_page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
WARN_ON(offset + len > block_ctx->len);
- offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1);
+ offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1);
while (len > 0) {
- cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
- BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
+ cur = min(len, ((size_t)PAGE_SIZE - offset_in_page));
+ BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
kaddr = block_ctx->datav[i];
memcpy(dst, kaddr + offset_in_page, cur);
@@ -1615,8 +1605,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
BUG_ON(!block_ctx->datav);
BUG_ON(!block_ctx->pagev);
- num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
while (num_pages > 0) {
num_pages--;
if (block_ctx->datav[num_pages]) {
@@ -1647,15 +1637,15 @@ static int btrfsic_read_block(struct btrfsic_state *state,
BUG_ON(block_ctx->datav);
BUG_ON(block_ctx->pagev);
BUG_ON(block_ctx->mem_to_free);
- if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
+ if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) {
printk(KERN_INFO
"btrfsic: read_block() with unaligned bytenr %llu\n",
block_ctx->dev_bytenr);
return -1;
}
- num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
sizeof(*block_ctx->pagev)) *
num_pages, GFP_NOFS);
@@ -1686,8 +1676,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
- PAGE_CACHE_SIZE, 0);
- if (PAGE_CACHE_SIZE != ret)
+ PAGE_SIZE, 0);
+ if (PAGE_SIZE != ret)
break;
}
if (j == i) {
@@ -1703,7 +1693,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
bio_put(bio);
- dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+ dev_bytenr += (j - i) * PAGE_SIZE;
i = j;
}
for (i = 0; i < num_pages; i++) {
@@ -1720,29 +1710,20 @@ static int btrfsic_read_block(struct btrfsic_state *state,
static void btrfsic_dump_database(struct btrfsic_state *state)
{
- struct list_head *elem_all;
+ const struct btrfsic_block *b_all;
BUG_ON(NULL == state);
printk(KERN_INFO "all_blocks_list:\n");
- list_for_each(elem_all, &state->all_blocks_list) {
- const struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *elem_ref_from;
+ list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
+ const struct btrfsic_block_link *l;
printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
btrfsic_get_block_type(state, b_all),
b_all->logical_bytenr, b_all->dev_state->name,
b_all->dev_bytenr, b_all->mirror_num);
- list_for_each(elem_ref_to, &b_all->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" refers %u* to"
" %c @%llu (%s/%llu/%d)\n",
@@ -1757,12 +1738,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
l->block_ref_to->mirror_num);
}
- list_for_each(elem_ref_from, &b_all->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from,
- struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
" is ref %u* from"
" %c @%llu (%s/%llu/%d)\n",
@@ -1793,9 +1769,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
u32 crc = ~(u32)0;
unsigned int i;
- if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+ if (num_pages * PAGE_SIZE < state->metablock_size)
return 1; /* not metadata */
- num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+ num_pages = state->metablock_size >> PAGE_SHIFT;
h = (struct btrfs_header *)datav[0];
if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1803,8 +1779,8 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
for (i = 0; i < num_pages; i++) {
u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
- size_t sublen = i ? PAGE_CACHE_SIZE :
- (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
+ size_t sublen = i ? PAGE_SIZE :
+ (PAGE_SIZE - BTRFS_CSUM_SIZE);
crc = btrfs_crc32c(crc, data, sublen);
}
@@ -1845,20 +1821,19 @@ again:
&state->block_hashtable);
if (NULL != block) {
u64 bytenr = 0;
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
+ struct btrfsic_block_link *l, *tmp;
if (block->is_superblock) {
bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
mapped_datav[0]);
- if (num_pages * PAGE_CACHE_SIZE <
+ if (num_pages * PAGE_SIZE <
BTRFS_SUPER_INFO_SIZE) {
printk(KERN_INFO
"btrfsic: cannot work with too short bios!\n");
return;
}
is_metadata = 1;
- BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+ BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1));
processed_len = BTRFS_SUPER_INFO_SIZE;
if (state->print_mask &
BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
@@ -1869,7 +1844,7 @@ again:
}
if (is_metadata) {
if (!block->is_superblock) {
- if (num_pages * PAGE_CACHE_SIZE <
+ if (num_pages * PAGE_SIZE <
state->metablock_size) {
printk(KERN_INFO
"btrfsic: cannot work with too short bios!\n");
@@ -1905,7 +1880,7 @@ again:
}
block->logical_bytenr = bytenr;
} else {
- if (num_pages * PAGE_CACHE_SIZE <
+ if (num_pages * PAGE_SIZE <
state->datablock_size) {
printk(KERN_INFO
"btrfsic: cannot work with too short bios!\n");
@@ -1967,13 +1942,8 @@ again:
* because it still carries valueable information
* like whether it was ever written and IO completed.
*/
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &block->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry_safe(l, tmp, &block->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
l->ref_cnt--;
@@ -2043,7 +2013,7 @@ again:
block->logical_bytenr = bytenr;
block->is_metadata = 1;
if (block->is_superblock) {
- BUG_ON(PAGE_CACHE_SIZE !=
+ BUG_ON(PAGE_SIZE !=
BTRFS_SUPER_INFO_SIZE);
ret = btrfsic_process_written_superblock(
state,
@@ -2202,8 +2172,8 @@ again:
continue_loop:
BUG_ON(!processed_len);
dev_bytenr += processed_len;
- mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
- num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+ mapped_datav += processed_len >> PAGE_SHIFT;
+ num_pages -= processed_len >> PAGE_SHIFT;
goto again;
}
@@ -2436,7 +2406,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
struct btrfsic_block *const block,
int recursion_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int ret = 0;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
@@ -2464,11 +2434,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
* This algorithm is recursive because the amount of used stack
* space is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2561,7 +2527,7 @@ static int btrfsic_is_block_ref_by_superblock(
const struct btrfsic_block *block,
int recursion_level)
{
- struct list_head *elem_ref_from;
+ const struct btrfsic_block_link *l;
if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
/* refer to comment at "abort cyclic linkage (case 1)" */
@@ -2576,11 +2542,7 @@ static int btrfsic_is_block_ref_by_superblock(
* This algorithm is recursive because the amount of used stack space
* is very small and the max recursion depth is limited.
*/
- list_for_each(elem_ref_from, &block->ref_from_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_from, struct btrfsic_block_link,
- node_ref_from);
-
+ list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2669,7 +2631,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
const struct btrfsic_block *block,
int indent_level)
{
- struct list_head *elem_ref_to;
+ const struct btrfsic_block_link *l;
int indent_add;
static char buf[80];
int cursor_position;
@@ -2704,11 +2666,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
}
cursor_position = indent_level;
- list_for_each(elem_ref_to, &block->ref_to_list) {
- const struct btrfsic_block_link *const l =
- list_entry(elem_ref_to, struct btrfsic_block_link,
- node_ref_to);
-
+ list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
while (cursor_position < indent_level) {
printk(" ");
cursor_position++;
@@ -2996,7 +2954,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
goto leave;
cur_bytenr = dev_bytenr;
for (i = 0; i < bio->bi_vcnt; i++) {
- BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+ BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE);
mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
if (!mapped_datav[i]) {
while (i > 0) {
@@ -3079,16 +3037,16 @@ int btrfsic_mount(struct btrfs_root *root,
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
- if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ if (root->nodesize & ((u64)PAGE_SIZE - 1)) {
printk(KERN_INFO
- "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->nodesize, PAGE_CACHE_SIZE);
+ "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n",
+ root->nodesize, PAGE_SIZE);
return -1;
}
- if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+ if (root->sectorsize & ((u64)PAGE_SIZE - 1)) {
printk(KERN_INFO
- "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
- root->sectorsize, PAGE_CACHE_SIZE);
+ "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n",
+ root->sectorsize, PAGE_SIZE);
return -1;
}
state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
@@ -3120,7 +3078,7 @@ int btrfsic_mount(struct btrfs_root *root,
list_for_each_entry(device, dev_head, dev_list) {
struct btrfsic_dev_state *ds;
- char *p;
+ const char *p;
if (!device->bdev || !device->name)
continue;
@@ -3136,11 +3094,7 @@ int btrfsic_mount(struct btrfs_root *root,
ds->state = state;
bdevname(ds->bdev, ds->name);
ds->name[BDEVNAME_SIZE - 1] = '\0';
- for (p = ds->name; *p != '\0'; p++);
- while (p > ds->name && *p != '/')
- p--;
- if (*p == '/')
- p++;
+ p = kbasename(ds->name);
strlcpy(ds->name, p, sizeof(ds->name));
btrfsic_dev_state_hashtable_add(ds,
&btrfsic_dev_state_hashtable);
@@ -3165,8 +3119,7 @@ int btrfsic_mount(struct btrfs_root *root,
void btrfsic_unmount(struct btrfs_root *root,
struct btrfs_fs_devices *fs_devices)
{
- struct list_head *elem_all;
- struct list_head *tmp_all;
+ struct btrfsic_block *b_all, *tmp_all;
struct btrfsic_state *state;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@@ -3206,20 +3159,12 @@ void btrfsic_unmount(struct btrfs_root *root,
* just free all memory that was allocated dynamically.
* Free the blocks and the block_links.
*/
- list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
- struct btrfsic_block *const b_all =
- list_entry(elem_all, struct btrfsic_block,
- all_blocks_node);
- struct list_head *elem_ref_to;
- struct list_head *tmp_ref_to;
-
- list_for_each_safe(elem_ref_to, tmp_ref_to,
- &b_all->ref_to_list) {
- struct btrfsic_block_link *const l =
- list_entry(elem_ref_to,
- struct btrfsic_block_link,
- node_ref_to);
+ list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
+ all_blocks_node) {
+ struct btrfsic_block_link *l, *tmp;
+ list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
+ node_ref_to) {
if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
btrfsic_print_rem_link(state, l);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 97b049ad0594..ff61a41ac90b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -119,7 +119,7 @@ static int check_compressed_csum(struct inode *inode,
csum = ~(u32)0;
kaddr = kmap_atomic(page);
- csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
+ csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE);
btrfs_csum_final(csum, (char *)&csum);
kunmap_atomic(kaddr);
@@ -190,7 +190,7 @@ csum_failed:
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
page->mapping = NULL;
- page_cache_release(page);
+ put_page(page);
}
/* do io completion on the original bio */
@@ -224,8 +224,8 @@ out:
static noinline void end_compressed_writeback(struct inode *inode,
const struct compressed_bio *cb)
{
- unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long index = cb->start >> PAGE_SHIFT;
+ unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
int i;
@@ -247,7 +247,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
if (cb->errors)
SetPageError(pages[i]);
end_page_writeback(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
@@ -304,7 +304,7 @@ static void end_compressed_bio_write(struct bio *bio)
for (index = 0; index < cb->nr_pages; index++) {
page = cb->compressed_pages[index];
page->mapping = NULL;
- page_cache_release(page);
+ put_page(page);
}
/* finally free the cb struct */
@@ -341,7 +341,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
int ret;
int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+ WARN_ON(start & ((u64)PAGE_SIZE - 1));
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
if (!cb)
return -ENOMEM;
@@ -374,14 +374,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
page->mapping = inode->i_mapping;
if (bio->bi_iter.bi_size)
ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
- PAGE_CACHE_SIZE,
+ PAGE_SIZE,
bio, 0);
else
ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
+ if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) <
+ PAGE_SIZE) {
bio_get(bio);
/*
@@ -410,15 +410,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
BUG_ON(!bio);
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
- bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_add_page(bio, page, PAGE_SIZE, 0);
}
- if (bytes_left < PAGE_CACHE_SIZE) {
+ if (bytes_left < PAGE_SIZE) {
btrfs_info(BTRFS_I(inode)->root->fs_info,
"bytes left %lu compress len %lu nr %lu",
bytes_left, cb->compressed_len, cb->nr_pages);
}
- bytes_left -= PAGE_CACHE_SIZE;
- first_byte += PAGE_CACHE_SIZE;
+ bytes_left -= PAGE_SIZE;
+ first_byte += PAGE_SIZE;
cond_resched();
}
bio_get(bio);
@@ -457,17 +457,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int misses = 0;
page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
- last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+ last_offset = (page_offset(page) + PAGE_SIZE);
em_tree = &BTRFS_I(inode)->extent_tree;
tree = &BTRFS_I(inode)->io_tree;
if (isize == 0)
return 0;
- end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (last_offset < compressed_end) {
- pg_index = last_offset >> PAGE_CACHE_SHIFT;
+ pg_index = last_offset >> PAGE_SHIFT;
if (pg_index > end_index)
break;
@@ -482,18 +482,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- page = __page_cache_alloc(mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index,
- GFP_NOFS)) {
- page_cache_release(page);
+ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
+ put_page(page);
goto next;
}
- end = last_offset + PAGE_CACHE_SIZE - 1;
+ end = last_offset + PAGE_SIZE - 1;
/*
* at this point, we have a locked page in the page cache
* for these bytes in the file. But, we have to make
@@ -503,27 +502,27 @@ static noinline int add_ra_bio_pages(struct inode *inode,
lock_extent(tree, last_offset, end);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em || last_offset < em->start ||
- (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+ (last_offset + PAGE_SIZE > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, last_offset, end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
break;
}
free_extent_map(em);
if (page->index == end_index) {
char *userpage;
- size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+ size_t zero_offset = isize & (PAGE_SIZE - 1);
if (zero_offset) {
int zeros;
- zeros = PAGE_CACHE_SIZE - zero_offset;
+ zeros = PAGE_SIZE - zero_offset;
userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, zeros);
flush_dcache_page(page);
@@ -532,19 +531,19 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
ret = bio_add_page(cb->orig_bio, page,
- PAGE_CACHE_SIZE, 0);
+ PAGE_SIZE, 0);
- if (ret == PAGE_CACHE_SIZE) {
+ if (ret == PAGE_SIZE) {
nr_pages++;
- page_cache_release(page);
+ put_page(page);
} else {
unlock_extent(tree, last_offset, end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
break;
}
next:
- last_offset += PAGE_CACHE_SIZE;
+ last_offset += PAGE_SIZE;
}
return 0;
}
@@ -568,7 +567,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
struct btrfs_root *root = BTRFS_I(inode)->root;
- unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
unsigned long compressed_len;
unsigned long nr_pages;
unsigned long pg_index;
@@ -590,7 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
page_offset(bio->bi_io_vec->bv_page),
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
read_unlock(&em_tree->lock);
if (!em)
return -EIO;
@@ -618,7 +617,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
- nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
+ nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
if (!cb->compressed_pages)
@@ -638,14 +637,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
faili = nr_pages - 1;
cb->nr_pages = nr_pages;
- /* In the parent-locked case, we only locked the range we are
- * interested in. In all other cases, we can opportunistically
- * cache decompressed data that goes beyond the requested range. */
- if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
- add_ra_bio_pages(inode, em_start + em_len, cb);
+ add_ra_bio_pages(inode, em_start + em_len, cb);
/* include any pages we added in add_ra-bio_pages */
- uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+ uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
cb->len = uncompressed_len;
comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
@@ -658,18 +653,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
- page->index = em_start >> PAGE_CACHE_SHIFT;
+ page->index = em_start >> PAGE_SHIFT;
if (comp_bio->bi_iter.bi_size)
ret = tree->ops->merge_bio_hook(READ, page, 0,
- PAGE_CACHE_SIZE,
+ PAGE_SIZE,
comp_bio, 0);
else
ret = 0;
page->mapping = NULL;
- if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
+ if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
+ PAGE_SIZE) {
bio_get(comp_bio);
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
@@ -707,9 +702,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio->bi_private = cb;
comp_bio->bi_end_io = end_compressed_bio_read;
- bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+ bio_add_page(comp_bio, page, PAGE_SIZE, 0);
}
- cur_disk_byte += PAGE_CACHE_SIZE;
+ cur_disk_byte += PAGE_SIZE;
}
bio_get(comp_bio);
@@ -1018,8 +1013,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
/* copy bytes from the working buffer into the pages */
while (working_bytes > 0) {
- bytes = min(PAGE_CACHE_SIZE - *pg_offset,
- PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(PAGE_SIZE - *pg_offset,
+ PAGE_SIZE - buf_offset);
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
@@ -1032,7 +1027,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
current_buf_start += bytes;
/* check if we need to pick another page */
- if (*pg_offset == PAGE_CACHE_SIZE) {
+ if (*pg_offset == PAGE_SIZE) {
(*pg_index)++;
if (*pg_index >= vcnt)
return 0;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 13a4dc0436c9..f49d8b8c0f00 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -48,6 +48,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
unsigned long pg_index,
unsigned long pg_offset);
+
+enum btrfs_compression_type {
+ BTRFS_COMPRESS_NONE = 0,
+ BTRFS_COMPRESS_ZLIB = 1,
+ BTRFS_COMPRESS_LZO = 2,
+ BTRFS_COMPRESS_TYPES = 2,
+ BTRFS_COMPRESS_LAST = 3,
+};
+
struct btrfs_compress_op {
struct list_head *(*alloc_workspace)(void);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5b8e235c4b6d..ec7928a27aaa 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
+#include <linux/vmalloc.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -311,7 +312,7 @@ struct tree_mod_root {
struct tree_mod_elem {
struct rb_node node;
- u64 index; /* shifted logical */
+ u64 logical;
u64 seq;
enum mod_log_op op;
@@ -435,11 +436,11 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
/*
* key order of the log:
- * index -> sequence
+ * node/leaf start address -> sequence
*
- * the index is the shifted logical of the *new* root node for root replace
- * operations, or the shifted logical of the affected block for all other
- * operations.
+ * The 'start address' is the logical address of the *new* root node
+ * for root replace operations, or the logical address of the affected
+ * block for all other operations.
*
* Note: must be called with write lock (tree_mod_log_write_lock).
*/
@@ -460,9 +461,9 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
while (*new) {
cur = container_of(*new, struct tree_mod_elem, node);
parent = *new;
- if (cur->index < tm->index)
+ if (cur->logical < tm->logical)
new = &((*new)->rb_left);
- else if (cur->index > tm->index)
+ else if (cur->logical > tm->logical)
new = &((*new)->rb_right);
else if (cur->seq < tm->seq)
new = &((*new)->rb_left);
@@ -523,7 +524,7 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
if (!tm)
return NULL;
- tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->logical = eb->start;
if (op != MOD_LOG_KEY_ADD) {
btrfs_node_key(eb, &tm->key, slot);
tm->blockptr = btrfs_node_blockptr(eb, slot);
@@ -588,7 +589,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
goto free_tms;
}
- tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->logical = eb->start;
tm->slot = src_slot;
tm->move.dst_slot = dst_slot;
tm->move.nr_items = nr_items;
@@ -699,7 +700,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
goto free_tms;
}
- tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+ tm->logical = new_root->start;
tm->old_root.logical = old_root->start;
tm->old_root.level = btrfs_header_level(old_root);
tm->generation = btrfs_header_generation(old_root);
@@ -739,16 +740,15 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
struct rb_node *node;
struct tree_mod_elem *cur = NULL;
struct tree_mod_elem *found = NULL;
- u64 index = start >> PAGE_CACHE_SHIFT;
tree_mod_log_read_lock(fs_info);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
cur = container_of(node, struct tree_mod_elem, node);
- if (cur->index < index) {
+ if (cur->logical < start) {
node = node->rb_left;
- } else if (cur->index > index) {
+ } else if (cur->logical > start) {
node = node->rb_right;
} else if (cur->seq < min_seq) {
node = node->rb_left;
@@ -1230,9 +1230,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
return NULL;
/*
- * the very last operation that's logged for a root is the replacement
- * operation (if it is replaced at all). this has the index of the *new*
- * root, making it the very first operation that's logged for this root.
+ * the very last operation that's logged for a root is the
+ * replacement operation (if it is replaced at all). this has
+ * the logical address of the *new* root, making it the very
+ * first operation that's logged for this root.
*/
while (1) {
tm = tree_mod_log_search_oldest(fs_info, root_logical,
@@ -1336,7 +1337,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
if (!next)
break;
tm = container_of(next, struct tree_mod_elem, node);
- if (tm->index != first_tm->index)
+ if (tm->logical != first_tm->logical)
break;
}
tree_mod_log_read_unlock(fs_info);
@@ -1555,7 +1556,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
- search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+ search_start = buf->start & ~((u64)SZ_1G - 1);
if (parent)
btrfs_set_lock_blocking(parent);
@@ -2248,7 +2249,6 @@ static void reada_for_search(struct btrfs_root *root,
u64 target;
u64 nread = 0;
u64 gen;
- int direction = path->reada;
struct extent_buffer *eb;
u32 nr;
u32 blocksize;
@@ -2276,16 +2276,16 @@ static void reada_for_search(struct btrfs_root *root,
nr = slot;
while (1) {
- if (direction < 0) {
+ if (path->reada == READA_BACK) {
if (nr == 0)
break;
nr--;
- } else if (direction > 0) {
+ } else if (path->reada == READA_FORWARD) {
nr++;
if (nr >= nritems)
break;
}
- if (path->reada < 0 && objectid) {
+ if (path->reada == READA_BACK && objectid) {
btrfs_node_key(node, &disk_key, nr);
if (btrfs_disk_key_objectid(&disk_key) != objectid)
break;
@@ -2493,7 +2493,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
free_extent_buffer(tmp);
- if (p->reada)
+ if (p->reada != READA_NONE)
reada_for_search(root, p, level, slot, key->objectid);
btrfs_release_path(p);
@@ -5362,10 +5362,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
goto out;
}
- tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
+ tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
if (!tmp_buf) {
- ret = -ENOMEM;
- goto out;
+ tmp_buf = vmalloc(left_root->nodesize);
+ if (!tmp_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
left_path->search_commit_root = 1;
@@ -5566,7 +5569,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
- kfree(tmp_buf);
+ kvfree(tmp_buf);
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a2e73f6053a8..84a6a5b3384a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
#include <linux/btrfs.h>
#include <linux/workqueue.h>
#include <linux/security.h>
+#include <linux/sizes.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -96,6 +97,12 @@ struct btrfs_ordered_sum;
/* for storing items that use the BTRFS_UUID_KEY* types */
#define BTRFS_UUID_TREE_OBJECTID 9ULL
+/* tracks free space in block groups. */
+#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+
+/* device stats in the device tree */
+#define BTRFS_DEV_STATS_OBJECTID 0ULL
+
/* for storing balance parameters in the root tree */
#define BTRFS_BALANCE_OBJECTID -4ULL
@@ -174,7 +181,7 @@ struct btrfs_ordered_sum;
/* csum types */
#define BTRFS_CSUM_TYPE_CRC32 0
-static int btrfs_csum_sizes[] = { 4 };
+static const int btrfs_csum_sizes[] = { 4 };
/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
@@ -196,9 +203,9 @@ static int btrfs_csum_sizes[] = { 4 };
/* ioprio of readahead is set to idle */
#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
-#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
-#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
/*
* The key defines the order in the tree, and so it also defines (optimal)
@@ -500,6 +507,8 @@ struct btrfs_super_block {
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
+#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
+
#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
@@ -526,7 +535,10 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP \
+ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
+
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
@@ -590,14 +602,15 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
+enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
/* if there is real range locking, this locks field will change */
- int locks[BTRFS_MAX_LEVEL];
- int reada;
+ u8 locks[BTRFS_MAX_LEVEL];
+ u8 reada;
/* keep some upper locks as we walk down */
- int lowest_level;
+ u8 lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
@@ -705,14 +718,6 @@ struct btrfs_timespec {
__le32 nsec;
} __attribute__ ((__packed__));
-enum btrfs_compression_type {
- BTRFS_COMPRESS_NONE = 0,
- BTRFS_COMPRESS_ZLIB = 1,
- BTRFS_COMPRESS_LZO = 2,
- BTRFS_COMPRESS_TYPES = 2,
- BTRFS_COMPRESS_LAST = 3,
-};
-
struct btrfs_inode_item {
/* nfs style generation number */
__le64 generation;
@@ -783,7 +788,7 @@ struct btrfs_root_item {
/*
* This generation number is used to test if the new fields are valid
- * and up to date while reading the root item. Everytime the root item
+ * and up to date while reading the root item. Every time the root item
* is written out, the "generation" field is copied into this field. If
* anyone ever mounted the fs with an older kernel, we will have
* mismatching generation values here and thus must invalidate the
@@ -992,8 +997,10 @@ struct btrfs_dev_replace {
pid_t lock_owner;
atomic_t nesting_level;
struct mutex lock_finishing_cancel_unmount;
- struct mutex lock_management_lock;
- struct mutex lock;
+ rwlock_t lock;
+ atomic_t read_locks;
+ atomic_t blocking_readers;
+ wait_queue_head_t read_lock_wq;
struct btrfs_scrub_progress scrub_progress;
};
@@ -1088,6 +1095,13 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+struct btrfs_free_space_info {
+ __le32 extent_count;
+ __le32 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
+
#define BTRFS_QGROUP_LEVEL_SHIFT 48
static inline u64 btrfs_qgroup_level(u64 qgroupid)
{
@@ -1205,10 +1219,10 @@ struct btrfs_space_info {
* we've called update_block_group and dropped the bytes_used counter
* and increased the bytes_pinned counter. However this means that
* bytes_pinned does not reflect the bytes that will be pinned once the
- * delayed refs are flushed, so this counter is inc'ed everytime we call
- * btrfs_free_extent so it is a realtime count of what will be freed
- * once the transaction is committed. It will be zero'ed everytime the
- * transaction commits.
+ * delayed refs are flushed, so this counter is inc'ed every time we
+ * call btrfs_free_extent so it is a realtime count of what will be
+ * freed once the transaction is committed. It will be zero'ed every
+ * time the transaction commits.
*/
struct percpu_counter total_bytes_pinned;
@@ -1296,6 +1310,9 @@ struct btrfs_caching_control {
atomic_t count;
};
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2)
+
struct btrfs_io_ctl {
void *cur, *orig;
struct page *page;
@@ -1321,8 +1338,20 @@ struct btrfs_block_group_cache {
u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
- u64 sectorsize;
u64 cache_generation;
+ u32 sectorsize;
+
+ /*
+ * If the free space extent count exceeds this number, convert the block
+ * group to bitmaps.
+ */
+ u32 bitmap_high_thresh;
+
+ /*
+ * If the free space extent count drops below this number, convert the
+ * block group back to extents.
+ */
+ u32 bitmap_low_thresh;
/*
* It is just used for the delayed data space allocation because
@@ -1378,6 +1407,15 @@ struct btrfs_block_group_cache {
struct list_head io_list;
struct btrfs_io_ctl io_ctl;
+
+ /* Lock for free space tree operations. */
+ struct mutex free_space_lock;
+
+ /*
+ * Does the block group need to be added to the free space tree?
+ * Protected by free_space_lock.
+ */
+ int needs_free_space;
};
/* delayed seq elem */
@@ -1429,6 +1467,7 @@ struct btrfs_fs_info {
struct btrfs_root *csum_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
+ struct btrfs_root *free_space_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1572,7 +1611,7 @@ struct btrfs_fs_info {
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
- struct rw_semaphore delayed_iput_sem;
+ struct mutex cleaner_delayed_iput_mutex;
/* this protects tree_mod_seq_list */
spinlock_t tree_mod_seq_lock;
@@ -1780,6 +1819,9 @@ struct btrfs_fs_info {
spinlock_t reada_lock;
struct radix_tree_root reada_tree;
+ /* readahead works cnt */
+ atomic_t reada_works_cnt;
+
/* Extent buffer radix tree */
spinlock_t buffer_lock;
struct radix_tree_root buffer_radix;
@@ -1816,6 +1858,8 @@ struct btrfs_fs_info {
* and will be latter freed. Protected by fs_info->chunk_mutex.
*/
struct list_head pinned_chunks;
+
+ int creating_free_space_tree;
};
struct btrfs_subvolume_writers {
@@ -2092,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
+/*
+ * Every block group is represented in the free space tree by a free space info
+ * item, which stores some accounting information. It is keyed on
+ * (block_group_start, FREE_SPACE_INFO, block_group_length).
+ */
+#define BTRFS_FREE_SPACE_INFO_KEY 198
+
+/*
+ * A free space extent tracks an extent of space that is free in a block group.
+ * It is keyed on (start, FREE_SPACE_EXTENT, length).
+ */
+#define BTRFS_FREE_SPACE_EXTENT_KEY 199
+
+/*
+ * When a block group becomes very fragmented, we convert it to use bitmaps
+ * instead of extents. A free space bitmap is keyed on
+ * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+ * (length / sectorsize) bits.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_KEY 200
+
#define BTRFS_DEV_EXTENT_KEY 204
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
@@ -2120,13 +2185,43 @@ struct btrfs_ioctl_defrag_range_args {
*/
#define BTRFS_QGROUP_RELATION_KEY 246
+/*
+ * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
+ */
#define BTRFS_BALANCE_ITEM_KEY 248
/*
- * Persistantly stores the io stats in the device tree.
- * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ * The key type for tree items that are stored persistently, but do not need to
+ * exist for extended period of time. The items can exist in any tree.
+ *
+ * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
+ *
+ * Existing items:
+ *
+ * - balance status item
+ * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
*/
-#define BTRFS_DEV_STATS_KEY 249
+#define BTRFS_TEMPORARY_ITEM_KEY 248
+
+/*
+ * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
+ */
+#define BTRFS_DEV_STATS_KEY 249
+
+/*
+ * The key type for tree items that are stored persistently and usually exist
+ * for a long period, eg. filesystem lifetime. The item kinds can be status
+ * information, stats or preference values. The item can exist in any tree.
+ *
+ * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
+ *
+ * Existing items:
+ *
+ * - device statistics, store IO stats in the device tree, one key for all
+ * stats
+ * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
+ */
+#define BTRFS_PERSISTENT_ITEM_KEY 249
/*
* Persistantly stores the device replace state in the device tree.
@@ -2176,7 +2271,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
-#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18)
#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
@@ -2184,9 +2279,11 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
+#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
+#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
-#define BTRFS_DEFAULT_MAX_INLINE (8192)
+#define BTRFS_DEFAULT_MAX_INLINE (2048)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2287,6 +2384,9 @@ struct btrfs_map_token {
unsigned long offset;
};
+#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
+ ((bytes) >> (fs_info)->sb->s_blocksize_bits)
+
static inline void btrfs_init_map_token (struct btrfs_map_token *token)
{
token->kaddr = NULL;
@@ -2506,6 +2606,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags,
BTRFS_SETGET_STACK_FUNCS(block_group_flags,
struct btrfs_block_group_item, flags, 64);
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+ extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
/* struct btrfs_inode_ref */
BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@ -3367,7 +3472,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
- return mapping_gfp_mask(mapping) & ~__GFP_FS;
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
/* extent-tree.c */
@@ -3377,8 +3482,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
unsigned num_items)
{
- return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
- 2 * num_items;
+ return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}
/*
@@ -3416,6 +3520,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -3479,6 +3584,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
u64 size);
+struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
+ struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start,
struct extent_map *em);
@@ -3566,9 +3674,13 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
void check_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const u64 type);
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end);
+
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3733,6 +3845,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->csum_root);
kfree(fs_info->quota_root);
kfree(fs_info->uuid_root);
+ kfree(fs_info->free_space_root);
kfree(fs_info->super_copy);
kfree(fs_info->super_for_commit);
security_free_mnt_opts(&fs_info->security_opts);
@@ -3902,7 +4015,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
- int wait;
int delay_iput;
struct completion completion;
struct list_head list;
@@ -3910,7 +4022,7 @@ struct btrfs_delalloc_work {
};
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput);
+ int delay_iput);
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
@@ -3948,7 +4060,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len);
-int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
int front);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -4010,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
int btrfs_is_empty_uuid(u8 *uuid);
@@ -4020,7 +4133,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
struct btrfs_ioctl_balance_args *bargs);
-
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff);
/* file.c */
int btrfs_auto_defrag_init(void);
@@ -4051,6 +4165,11 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
loff_t pos, size_t write_bytes,
struct extent_state **cached);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags);
+int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4066,7 +4185,8 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
/* super.c */
-int btrfs_parse_options(struct btrfs_root *root, char *options);
+int btrfs_parse_options(struct btrfs_root *root, char *options,
+ unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
#ifdef CONFIG_PRINTK
@@ -4243,16 +4363,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
}
}
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_incompat_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
#define btrfs_fs_incompat(fs_info, opt) \
__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
-static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
{
struct btrfs_super_block *disk_super;
disk_super = fs_info->super_copy;
return !!(btrfs_super_incompat_flags(disk_super) & flag);
}
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (!(features & flag)) {
+ features |= flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "setting %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+ u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ u64 features;
+
+ disk_super = fs_info->super_copy;
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ spin_lock(&fs_info->super_lock);
+ features = btrfs_super_compat_ro_flags(disk_super);
+ if (features & flag) {
+ features &= ~flag;
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ btrfs_info(fs_info, "clearing %llu ro feature flag",
+ flag);
+ }
+ spin_unlock(&fs_info->super_lock);
+ }
+}
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+{
+ struct btrfs_super_block *disk_super;
+ disk_super = fs_info->super_copy;
+ return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+}
+
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
@@ -4358,8 +4560,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
struct btrfs_key *start, struct btrfs_key *end);
int btrfs_reada_wait(void *handle);
void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
- u64 start, int err);
+int btree_readahead_hook(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, u64 start, int err);
static inline int is_fstree(u64 rootid)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e0941fbb913c..6cef0062f929 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -43,8 +43,7 @@ int __init btrfs_delayed_inode_init(void)
void btrfs_delayed_inode_exit(void)
{
- if (delayed_node_cache)
- kmem_cache_destroy(delayed_node_cache);
+ kmem_cache_destroy(delayed_node_cache);
}
static inline void btrfs_init_delayed_node(
@@ -54,16 +53,11 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
atomic_set(&delayed_node->refs, 0);
- delayed_node->count = 0;
- delayed_node->flags = 0;
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
- delayed_node->index_cnt = 0;
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
- delayed_node->bytes_reserved = 0;
- memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
}
static inline int btrfs_is_continuous_delayed_item(
@@ -132,7 +126,7 @@ again:
if (node)
return node;
- node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
if (!node)
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
@@ -656,9 +650,14 @@ static int btrfs_delayed_inode_reserve_metadata(
goto out;
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!WARN_ON(ret))
+ if (!ret)
goto out;
+ if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ btrfs_debug(root->fs_info,
+ "block rsv migrate returned %d", ret);
+ WARN_ON(1);
+ }
/*
* Ok this is a problem, let's just steal from the global rsv
* since this really shouldn't happen that often.
@@ -1694,7 +1693,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
*
*/
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list)
+ struct list_head *ins_list, bool *emitted)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
@@ -1738,6 +1737,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
if (over)
return 1;
+ *emitted = true;
}
return 0;
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f70119f25421..0167853c84ae 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -144,7 +144,7 @@ void btrfs_put_delayed_items(struct list_head *ins_list,
int btrfs_should_delete_dir_index(struct list_head *del_list,
u64 index);
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- struct list_head *ins_list);
+ struct list_head *ins_list, bool *emitted);
/* for init */
int __init btrfs_delayed_inode_init(void);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e06dd75ad13f..430b3689b112 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -493,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
memcpy(&existing_ref->extent_op->key,
&ref->extent_op->key,
sizeof(ref->extent_op->key));
- existing_ref->extent_op->update_key = 1;
+ existing_ref->extent_op->update_key = true;
}
if (ref->extent_op->update_flags) {
existing_ref->extent_op->flags_to_set |=
ref->extent_op->flags_to_set;
- existing_ref->extent_op->update_flags = 1;
+ existing_ref->extent_op->update_flags = true;
}
btrfs_free_delayed_extent_op(ref->extent_op);
}
@@ -929,14 +929,10 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
void btrfs_delayed_ref_exit(void)
{
- if (btrfs_delayed_ref_head_cachep)
- kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
- if (btrfs_delayed_tree_ref_cachep)
- kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
- if (btrfs_delayed_data_ref_cachep)
- kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
- if (btrfs_delayed_extent_op_cachep)
- kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+ kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+ kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+ kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+ kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
}
int btrfs_delayed_ref_init(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 00ed02cbf3e9..c24b653c7343 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -75,11 +75,11 @@ struct btrfs_delayed_ref_node {
struct btrfs_delayed_extent_op {
struct btrfs_disk_key key;
+ u8 level;
+ bool update_key;
+ bool update_flags;
+ bool is_data;
u64 flags_to_set;
- int level;
- unsigned int update_key:1;
- unsigned int update_flags:1;
- unsigned int is_data:1;
};
/*
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1e668fb7dd4c..26bcb487f958 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -202,13 +202,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_dev_replace_item *ptr;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 0);
if (!dev_replace->is_valid ||
!dev_replace->item_needs_writeback) {
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 0);
return 0;
}
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 0);
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +264,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_dev_replace_item);
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 1);
if (dev_replace->srcdev)
btrfs_set_dev_replace_src_devid(eb, ptr,
dev_replace->srcdev->devid);
@@ -287,7 +287,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
btrfs_set_dev_replace_cursor_right(eb, ptr,
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
btrfs_mark_buffer_dirty(eb);
@@ -356,7 +356,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
return PTR_ERR(trans);
}
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 1);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -394,8 +394,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
dev_replace->cursor_right = 0;
dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 1;
+ atomic64_set(&dev_replace->num_write_errors, 0);
+ atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -407,7 +409,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 1);
goto leave;
}
@@ -433,7 +435,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
leave:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
}
@@ -471,18 +473,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* don't allow cancel or unmount to disturb the finishing procedure */
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 0);
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 0);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 0);
/*
* flush all outstanding I/O and inode extent mappings before the
@@ -507,7 +509,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
mutex_lock(&root->fs_info->chunk_mutex);
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 1);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -528,7 +530,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
rcu_str_deref(src_device->name),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
mutex_unlock(&root->fs_info->chunk_mutex);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
@@ -565,7 +567,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -614,7 +616,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
em = lookup_extent_mapping(em_tree, start, (u64)-1);
if (!em)
break;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
@@ -649,7 +651,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *srcdev;
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 0);
/* even if !dev_replace_is_valid, the values are good enough for
* the replace_status ioctl */
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -675,7 +677,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
break;
}
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 0);
}
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
@@ -698,13 +700,13 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
return -EROFS;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 1);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
goto leave;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
@@ -717,7 +719,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
@@ -740,7 +742,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 1);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -756,7 +758,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}
@@ -766,12 +768,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
struct task_struct *task;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 1);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
break;
@@ -784,10 +786,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
return 0;
}
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 1);
WARN_ON(atomic_xchg(
&fs_info->mutually_exclusive_operation_running, 1));
@@ -802,7 +804,7 @@ static int btrfs_dev_replace_kthread(void *data)
struct btrfs_ioctl_dev_replace_args *status_args;
u64 progress;
- status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+ status_args = kzalloc(sizeof(*status_args), GFP_KERNEL);
if (status_args) {
btrfs_dev_replace_status(fs_info, status_args);
progress = status_args->status.progress_1000;
@@ -858,55 +860,65 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
* not called and the the filesystem is remounted
* in degraded state. This does not stop the
* dev_replace procedure. It needs to be canceled
- * manually if the cancelation is wanted.
+ * manually if the cancellation is wanted.
*/
break;
}
return 1;
}
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
{
- /* the beginning is just an optimization for the typical case */
- if (atomic_read(&dev_replace->nesting_level) == 0) {
-acquire_lock:
- /* this is not a nested case where the same thread
- * is trying to acqurire the same lock twice */
- mutex_lock(&dev_replace->lock);
- mutex_lock(&dev_replace->lock_management_lock);
- dev_replace->lock_owner = current->pid;
- atomic_inc(&dev_replace->nesting_level);
- mutex_unlock(&dev_replace->lock_management_lock);
- return;
+ if (rw == 1) {
+ /* write */
+again:
+ wait_event(dev_replace->read_lock_wq,
+ atomic_read(&dev_replace->blocking_readers) == 0);
+ write_lock(&dev_replace->lock);
+ if (atomic_read(&dev_replace->blocking_readers)) {
+ write_unlock(&dev_replace->lock);
+ goto again;
+ }
+ } else {
+ read_lock(&dev_replace->lock);
+ atomic_inc(&dev_replace->read_locks);
}
+}
- mutex_lock(&dev_replace->lock_management_lock);
- if (atomic_read(&dev_replace->nesting_level) > 0 &&
- dev_replace->lock_owner == current->pid) {
- WARN_ON(!mutex_is_locked(&dev_replace->lock));
- atomic_inc(&dev_replace->nesting_level);
- mutex_unlock(&dev_replace->lock_management_lock);
- return;
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
+{
+ if (rw == 1) {
+ /* write */
+ ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+ write_unlock(&dev_replace->lock);
+ } else {
+ ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+ atomic_dec(&dev_replace->read_locks);
+ read_unlock(&dev_replace->lock);
}
+}
- mutex_unlock(&dev_replace->lock_management_lock);
- goto acquire_lock;
+/* inc blocking cnt and release read lock */
+void btrfs_dev_replace_set_lock_blocking(
+ struct btrfs_dev_replace *dev_replace)
+{
+ /* only set blocking for read lock */
+ ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+ atomic_inc(&dev_replace->blocking_readers);
+ read_unlock(&dev_replace->lock);
}
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+/* acquire read lock and dec blocking cnt */
+void btrfs_dev_replace_clear_lock_blocking(
+ struct btrfs_dev_replace *dev_replace)
{
- WARN_ON(!mutex_is_locked(&dev_replace->lock));
- mutex_lock(&dev_replace->lock_management_lock);
- WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
- WARN_ON(dev_replace->lock_owner != current->pid);
- atomic_dec(&dev_replace->nesting_level);
- if (atomic_read(&dev_replace->nesting_level) == 0) {
- dev_replace->lock_owner = 0;
- mutex_unlock(&dev_replace->lock_management_lock);
- mutex_unlock(&dev_replace->lock);
- } else {
- mutex_unlock(&dev_replace->lock_management_lock);
- }
+ /* only set blocking for read lock */
+ ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+ ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+ read_lock(&dev_replace->lock);
+ if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
+ waitqueue_active(&dev_replace->read_lock_wq))
+ wake_up(&dev_replace->read_lock_wq);
}
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 20035cbbf021..29e3ef5f96bd 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -34,8 +34,11 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
+void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_clear_lock_blocking(
+ struct btrfs_dev_replace *dev_replace);
static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
{
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2d4667594681..4e47849d7427 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -25,7 +25,6 @@
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
@@ -42,6 +41,7 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
@@ -49,11 +49,18 @@
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
+#include "compression.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
#endif
+#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
+ BTRFS_HEADER_FLAG_RELOC |\
+ BTRFS_SUPER_FLAG_ERROR |\
+ BTRFS_SUPER_FLAG_SEEDING |\
+ BTRFS_SUPER_FLAG_METADUMP)
+
static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
@@ -103,8 +110,7 @@ int __init btrfs_end_io_wq_init(void)
void btrfs_end_io_wq_exit(void)
{
- if (btrfs_end_io_wq_cache)
- kmem_cache_destroy(btrfs_end_io_wq_cache);
+ kmem_cache_destroy(btrfs_end_io_wq_cache);
}
/*
@@ -175,6 +181,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
{ .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
+ { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
{ .id = 0, .name_stem = "tree" },
};
@@ -295,7 +302,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
err = map_private_extent_buffer(buf, offset, 32,
&kaddr, &map_start, &map_len);
if (err)
- return 1;
+ return err;
cur_len = min(len, map_len - (offset - map_start));
crc = btrfs_csum_data(kaddr + offset - map_start,
crc, cur_len);
@@ -305,7 +312,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
if (csum_size > sizeof(inline_result)) {
result = kzalloc(csum_size, GFP_NOFS);
if (!result)
- return 1;
+ return -ENOMEM;
} else {
result = (char *)&inline_result;
}
@@ -326,7 +333,7 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
kfree(result);
- return 1;
+ return -EUCLEAN;
}
} else {
write_extent_buffer(buf, result, 0, csum_size);
@@ -362,7 +369,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
}
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
- 0, &cached_state);
+ &cached_state);
if (extent_buffer_uptodate(eb) &&
btrfs_header_generation(eb) == parent_transid) {
ret = 0;
@@ -505,11 +512,21 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
return 0;
+
found_start = btrfs_header_bytenr(eb);
- if (WARN_ON(found_start != start || !PageUptodate(page)))
- return 0;
- csum_tree_block(fs_info, eb, 0);
- return 0;
+ /*
+ * Please do not consolidate these warnings into a single if.
+ * It is useful to know what went wrong.
+ */
+ if (WARN_ON(found_start != start))
+ return -EUCLEAN;
+ if (WARN_ON(!PageUptodate(page)))
+ return -EUCLEAN;
+
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
+ btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
+
+ return csum_tree_block(fs_info, eb, 0);
}
static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
@@ -604,6 +621,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
int found_level;
struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int reads_done;
@@ -629,21 +647,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
- found_start, eb->start);
+ btrfs_err_rl(fs_info, "bad tree block start %llu %llu",
+ found_start, eb->start);
ret = -EIO;
goto err;
}
- if (check_tree_block_fsid(root->fs_info, eb)) {
- btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
- eb->start);
+ if (check_tree_block_fsid(fs_info, eb)) {
+ btrfs_err_rl(fs_info, "bad fsid on block %llu",
+ eb->start);
ret = -EIO;
goto err;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_err(root->fs_info, "bad tree block level %d",
- (int)btrfs_header_level(eb));
+ btrfs_err(fs_info, "bad tree block level %d",
+ (int)btrfs_header_level(eb));
ret = -EIO;
goto err;
}
@@ -651,11 +669,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
eb, found_level);
- ret = csum_tree_block(root->fs_info, eb, 1);
- if (ret) {
- ret = -EIO;
+ ret = csum_tree_block(fs_info, eb, 1);
+ if (ret)
goto err;
- }
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
@@ -672,7 +688,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
err:
if (reads_done &&
test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(root, eb, eb->start, ret);
+ btree_readahead_hook(fs_info, eb, eb->start, ret);
if (ret) {
/*
@@ -691,14 +707,13 @@ out:
static int btree_io_failed_hook(struct page *page, int failed_mirror)
{
struct extent_buffer *eb;
- struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
eb = (struct extent_buffer *)page->private;
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = failed_mirror;
atomic_dec(&eb->io_pages);
if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
- btree_readahead_hook(root, eb, eb->start, -EIO);
+ btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
return -EIO; /* we fixed nothing */
}
@@ -808,7 +823,7 @@ static void run_one_async_done(struct btrfs_work *work)
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
- /* If an error occured we just want to clean up the bio and move on */
+ /* If an error occurred we just want to clean up the bio and move on */
if (async->error) {
async->bio->bi_error = async->error;
bio_endio(async->bio);
@@ -923,7 +938,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
if (bio_flags & EXTENT_BIO_TREE_LOG)
return 0;
#ifdef CONFIG_X86
- if (cpu_has_xmm4_2)
+ if (static_cpu_has(X86_FEATURE_XMM4_2))
return 0;
#endif
return 1;
@@ -1047,7 +1062,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
(unsigned long long)page_offset(page));
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
}
@@ -1288,9 +1303,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
spin_lock_init(&root->root_item_lock);
}
-static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
+ gfp_t flags)
{
- struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+ struct btrfs_root *root = kzalloc(sizeof(*root), flags);
if (root)
root->fs_info = fs_info;
return root;
@@ -1302,7 +1318,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
{
struct btrfs_root *root;
- root = btrfs_alloc_root(NULL);
+ root = btrfs_alloc_root(NULL, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
__setup_root(4096, 4096, 4096, root, NULL, 1);
@@ -1324,7 +1340,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
int ret = 0;
uuid_le uuid;
- root = btrfs_alloc_root(fs_info);
+ root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1400,7 +1416,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- root = btrfs_alloc_root(fs_info);
+ root = btrfs_alloc_root(fs_info, GFP_NOFS);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1498,7 +1514,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
if (!path)
return ERR_PTR(-ENOMEM);
- root = btrfs_alloc_root(fs_info);
+ root = btrfs_alloc_root(fs_info, GFP_NOFS);
if (!root) {
ret = -ENOMEM;
goto alloc_fail;
@@ -1582,8 +1598,23 @@ int btrfs_init_fs_root(struct btrfs_root *root)
ret = get_anon_bdev(&root->anon_dev);
if (ret)
goto free_writers;
+
+ mutex_lock(&root->objectid_mutex);
+ ret = btrfs_find_highest_objectid(root,
+ &root->highest_objectid);
+ if (ret) {
+ mutex_unlock(&root->objectid_mutex);
+ goto free_root_dev;
+ }
+
+ ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ mutex_unlock(&root->objectid_mutex);
+
return 0;
+free_root_dev:
+ free_anon_bdev(root->anon_dev);
free_writers:
btrfs_free_subvolume_writers(root->subv_writers);
fail:
@@ -1650,6 +1681,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
return fs_info->uuid_root ? fs_info->uuid_root :
ERR_PTR(-ENOENT);
+ if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+ return fs_info->free_space_root ? fs_info->free_space_root :
+ ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root) {
@@ -1730,7 +1764,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
if (err)
return err;
- bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+ bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
@@ -1762,7 +1796,6 @@ static int cleaner_kthread(void *arg)
int again;
struct btrfs_trans_handle *trans;
- set_freezable();
do {
again = 0;
@@ -1782,7 +1815,10 @@ static int cleaner_kthread(void *arg)
goto sleep;
}
+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
btrfs_run_delayed_iputs(root);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+
again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -1802,7 +1838,7 @@ static int cleaner_kthread(void *arg)
*/
btrfs_delete_unused_bgs(root->fs_info);
sleep:
- if (!try_to_freeze() && !again) {
+ if (!again) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
schedule();
@@ -1892,14 +1928,12 @@ sleep:
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
&root->fs_info->fs_state)))
btrfs_cleanup_transaction(root);
- if (!try_to_freeze()) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (!kthread_should_stop() &&
- (!btrfs_transaction_blocked(root->fs_info) ||
- cannot_commit))
- schedule_timeout(delay);
- __set_current_state(TASK_RUNNING);
- }
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!kthread_should_stop() &&
+ (!btrfs_transaction_blocked(root->fs_info) ||
+ cannot_commit))
+ schedule_timeout(delay);
+ __set_current_state(TASK_RUNNING);
} while (!kthread_should_stop());
return 0;
}
@@ -2148,6 +2182,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->uuid_root);
if (chunk_root)
free_root_extent_buffers(info->chunk_root);
+ free_root_extent_buffers(info->free_space_root);
}
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2243,9 +2278,11 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
fs_info->dev_replace.lock_owner = 0;
atomic_set(&fs_info->dev_replace.nesting_level, 0);
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
- mutex_init(&fs_info->dev_replace.lock_management_lock);
- mutex_init(&fs_info->dev_replace.lock);
+ rwlock_init(&fs_info->dev_replace.lock);
+ atomic_set(&fs_info->dev_replace.read_locks, 0);
+ atomic_set(&fs_info->dev_replace.blocking_readers, 0);
init_waitqueue_head(&fs_info->replace_wait);
+ init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}
static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
@@ -2356,7 +2393,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
return -EIO;
}
- log_tree_root = btrfs_alloc_root(fs_info);
+ log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!log_tree_root)
return -ENOMEM;
@@ -2448,6 +2485,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
fs_info->uuid_root = root;
}
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+ root = btrfs_read_tree_root(tree_root, &location);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->free_space_root = root;
+ }
+
return 0;
}
@@ -2472,8 +2518,8 @@ int open_ctree(struct super_block *sb,
int backup_index = 0;
int max_active;
- tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
- chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+ tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+ chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!tree_root || !chunk_root) {
err = -ENOMEM;
goto fail;
@@ -2496,7 +2542,7 @@ int open_ctree(struct super_block *sb,
err = ret;
goto fail_bdi;
}
- fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+ fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
@@ -2542,8 +2588,8 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->delete_unused_bgs_mutex);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
+ mutex_init(&fs_info->cleaner_delayed_iput_mutex);
seqlock_init(&fs_info->profiles_lock);
- init_rwsem(&fs_info->delayed_iput_sem);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
@@ -2565,6 +2611,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
atomic_set(&fs_info->qgroup_op_seq, 0);
+ atomic_set(&fs_info->reada_works_cnt, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
@@ -2575,7 +2622,7 @@ int open_ctree(struct super_block *sb,
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2584,7 +2631,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->ordered_roots);
spin_lock_init(&fs_info->ordered_root_lock);
fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
- GFP_NOFS);
+ GFP_KERNEL);
if (!fs_info->delayed_root) {
err = -ENOMEM;
goto fail_iput;
@@ -2668,6 +2715,7 @@ int open_ctree(struct super_block *sb,
if (btrfs_check_super_csum(bh->b_data)) {
printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
err = -EINVAL;
+ brelse(bh);
goto fail_alloc;
}
@@ -2711,7 +2759,7 @@ int open_ctree(struct super_block *sb,
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
- ret = btrfs_parse_options(tree_root, options);
+ ret = btrfs_parse_options(tree_root, options, sb->s_flags);
if (ret) {
err = ret;
goto fail_alloc;
@@ -2727,26 +2775,6 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- /*
- * Leafsize and nodesize were always equal, this is only a sanity check.
- */
- if (le32_to_cpu(disk_super->__unused_leafsize) !=
- btrfs_super_nodesize(disk_super)) {
- printk(KERN_ERR "BTRFS: couldn't mount because metadata "
- "blocksizes don't match. node %d leaf %d\n",
- btrfs_super_nodesize(disk_super),
- le32_to_cpu(disk_super->__unused_leafsize));
- err = -EINVAL;
- goto fail_alloc;
- }
- if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
- printk(KERN_ERR "BTRFS: couldn't mount because metadata "
- "blocksize (%d) was too large\n",
- btrfs_super_nodesize(disk_super));
- err = -EINVAL;
- goto fail_alloc;
- }
-
features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
@@ -2759,7 +2787,7 @@ int open_ctree(struct super_block *sb,
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
- if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
@@ -2809,7 +2837,7 @@ int open_ctree(struct super_block *sb,
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+ SZ_4M / PAGE_SIZE);
tree_root->nodesize = nodesize;
tree_root->sectorsize = sectorsize;
@@ -2818,17 +2846,6 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
- if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
- goto fail_sb_buffer;
- }
-
- if (sectorsize != PAGE_SIZE) {
- printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
- "found on %s\n", (unsigned long)sectorsize, sb->s_id);
- goto fail_sb_buffer;
- }
-
mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
@@ -2900,6 +2917,18 @@ retry_root_backup:
tree_root->commit_root = btrfs_root_node(tree_root);
btrfs_set_root_refs(&tree_root->root_item, 1);
+ mutex_lock(&tree_root->objectid_mutex);
+ ret = btrfs_find_highest_objectid(tree_root,
+ &tree_root->highest_objectid);
+ if (ret) {
+ mutex_unlock(&tree_root->objectid_mutex);
+ goto recovery_tree_root;
+ }
+
+ ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+
+ mutex_unlock(&tree_root->objectid_mutex);
+
ret = btrfs_read_roots(fs_info, tree_root);
if (ret)
goto recovery_tree_root;
@@ -3009,8 +3038,9 @@ retry_root_backup:
if (ret)
goto fail_trans_kthread;
- /* do not make disk changes in broken FS */
- if (btrfs_super_log_root(disk_super) != 0) {
+ /* do not make disk changes in a broken FS or if nologreplay is given */
+ if (btrfs_super_log_root(disk_super) != 0 &&
+ !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3051,6 +3081,18 @@ retry_root_backup:
if (sb->s_flags & MS_RDONLY)
return 0;
+ if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+ !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: creating free space tree\n");
+ ret = btrfs_create_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to create free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
down_read(&fs_info->cleanup_work_sem);
if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
@@ -3076,6 +3118,18 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
+ if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+ btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ pr_info("BTRFS: clearing free space tree\n");
+ ret = btrfs_clear_free_space_tree(fs_info);
+ if (ret) {
+ pr_warn("BTRFS: failed to clear free space tree %d\n",
+ ret);
+ close_ctree(tree_root);
+ return ret;
+ }
+ }
+
if (!fs_info->uuid_root) {
pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
@@ -3102,6 +3156,12 @@ retry_root_backup:
fs_info->open = 1;
+ /*
+ * backuproot only affects mount behavior, so once open_ctree has
+ * succeeded there is no need to keep the flag
+ */
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+
return 0;
fail_qgroup:
@@ -3156,7 +3216,7 @@ fail:
return err;
recovery_tree_root:
- if (!btrfs_test_opt(tree_root, RECOVERY))
+ if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
goto fail_tree_roots;
free_root_pointers(fs_info, 0);
@@ -3780,6 +3840,9 @@ void close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* wait for the qgroup rescan worker to stop */
+ btrfs_qgroup_wait_for_completion(fs_info);
+
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al., set sem back to initial state */
@@ -3899,11 +3962,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
return !ret;
}
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
-{
- return set_extent_buffer_uptodate(buf);
-}
-
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
struct btrfs_root *root;
@@ -3959,7 +4017,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
balance_dirty_pages_ratelimited(
root->fs_info->btree_inode->i_mapping);
}
- return;
}
void btrfs_btree_balance_dirty(struct btrfs_root *root)
@@ -3982,8 +4039,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only)
{
struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 nodesize = btrfs_super_nodesize(sb);
+ u64 sectorsize = btrfs_super_sectorsize(sb);
int ret = 0;
+ if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+ printk(KERN_ERR "BTRFS: no valid FS found\n");
+ ret = -EINVAL;
+ }
+ if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
+ printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n",
+ btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
@@ -4001,31 +4067,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
}
/*
- * The common minimum, we don't know if we can trust the nodesize/sectorsize
- * items yet, they'll be verified later. Issue just a warning.
+ * Check sectorsize and nodesize first, the other checks will need them.
+ * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
*/
- if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
+ if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize);
+ ret = -EINVAL;
+ }
+ /* Only sectorsize == PAGE_SIZE is supported for now */
+ if (sectorsize != PAGE_SIZE) {
+ printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n",
+ sectorsize, PAGE_SIZE);
+ ret = -EINVAL;
+ }
+ if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+ nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+ printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize);
+ ret = -EINVAL;
+ }
+ if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+ printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n",
+ le32_to_cpu(sb->__unused_leafsize),
+ nodesize);
+ ret = -EINVAL;
+ }
+
+ /* Root alignment check */
+ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
btrfs_super_root(sb));
- if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+ ret = -EINVAL;
+ }
+ if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
btrfs_super_chunk_root(sb));
- if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
- printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
- btrfs_super_log_root(sb));
-
- /*
- * Check the lower bound, the alignment and other constraints are
- * checked later.
- */
- if (btrfs_super_nodesize(sb) < 4096) {
- printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
- btrfs_super_nodesize(sb));
ret = -EINVAL;
}
- if (btrfs_super_sectorsize(sb) < 4096) {
- printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
- btrfs_super_sectorsize(sb));
+ if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+ printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+ btrfs_super_log_root(sb));
ret = -EINVAL;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index adeb31830b9c..8e79d0070bcf 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,7 +19,7 @@
#ifndef __DISKIO__
#define __DISKIO__
-#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
#define BTRFS_SUPER_INFO_SIZE 4096
#define BTRFS_SUPER_MIRROR_MAX 3
@@ -35,7 +35,7 @@ enum btrfs_wq_endio_type {
static inline u64 btrfs_sb_offset(int mirror)
{
- u64 start = 16 * 1024;
+ u64 start = SZ_16K;
if (mirror)
return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
return BTRFS_SUPER_INFO_OFFSET;
@@ -116,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 99a8e57da8a1..84e060eb0de8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
+#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
@@ -124,7 +125,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
atomic_inc(&cache->count);
}
@@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root,
* we need to check the pinned_extents for any extents that can't be used yet
* since their free space will be released as soon as the transaction commits.
*/
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
- struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+ struct btrfs_fs_info *info, u64 start, u64 end)
{
u64 extent_start, extent_end, size, total_added = 0;
int ret;
@@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
return total_added;
}
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
- struct btrfs_caching_control *caching_ctl;
struct btrfs_root *extent_root;
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work)
u64 total_found = 0;
u64 last = 0;
u32 nritems;
- int ret = -ENOMEM;
+ int ret;
bool wakeup = true;
- caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
extent_root = fs_info->extent_root;
path = btrfs_alloc_path();
if (!path)
- goto out;
+ return -ENOMEM;
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -438,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 1;
+ path->reada = READA_FORWARD;
key.objectid = last;
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
- mutex_lock(&caching_ctl->mutex);
- /* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto err;
+ goto out;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -477,12 +472,14 @@ next:
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
- goto again;
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+ goto next;
}
ret = btrfs_next_leaf(extent_root, path);
if (ret < 0)
- goto err;
+ goto out;
if (ret)
break;
leaf = path->nodes[0];
@@ -521,7 +518,7 @@ next:
else
last = key.objectid + key.offset;
- if (total_found > (1024 * 1024 * 2)) {
+ if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
if (wakeup)
wake_up(&caching_ctl->wait);
@@ -534,9 +531,37 @@ next:
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
+ caching_ctl->progress = (u64)-1;
+
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_root *extent_root;
+ int ret;
+
+ caching_ctl = container_of(work, struct btrfs_caching_control, work);
+ block_group = caching_ctl->block_group;
+ fs_info = block_group->fs_info;
+ extent_root = fs_info->extent_root;
+
+ mutex_lock(&caching_ctl->mutex);
+ down_read(&fs_info->commit_root_sem);
+
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ ret = load_free_space_tree(caching_ctl);
+ else
+ ret = load_extent_tree_free(caching_ctl);
+
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_FINISHED;
+ block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
#ifdef CONFIG_BTRFS_DEBUG
@@ -555,20 +580,11 @@ next:
#endif
caching_ctl->progress = (u64)-1;
-err:
- btrfs_free_path(path);
- up_read(&fs_info->commit_root_sem);
-
- free_excluded_extents(extent_root, block_group);
+ up_read(&fs_info->commit_root_sem);
+ free_excluded_extents(fs_info->extent_root, block_group);
mutex_unlock(&caching_ctl->mutex);
-out:
- if (ret) {
- spin_lock(&block_group->lock);
- block_group->caching_ctl = NULL;
- block_group->cached = BTRFS_CACHE_ERROR;
- spin_unlock(&block_group->lock);
- }
+
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
@@ -680,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
} else {
/*
- * We are not going to do the fast caching, set cached to the
- * appropriate value and wakeup any waiters.
+ * We're either using the free space tree or no caching at all.
+ * Set cached to the appropriate value and wakeup any waiters.
*/
spin_lock(&cache->lock);
if (load_cache_only) {
@@ -2115,7 +2131,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
@@ -2141,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
@@ -2254,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
}
again:
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
path, 0, 1);
@@ -2910,6 +2926,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (trans->aborted)
return 0;
+ if (root->fs_info->creating_free_space_tree)
+ return 0;
+
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
@@ -2988,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return -ENOMEM;
extent_op->flags_to_set = flags;
- extent_op->update_flags = 1;
- extent_op->update_key = 0;
- extent_op->is_data = is_data ? 1 : 0;
+ extent_op->update_flags = true;
+ extent_op->update_key = false;
+ extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
@@ -3328,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
* If this block group is smaller than 100 megs don't bother caching the
* block group.
*/
- if (block_group->key.offset < (100 * 1024 * 1024)) {
+ if (block_group->key.offset < (100 * SZ_1M)) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
spin_unlock(&block_group->lock);
@@ -3428,12 +3447,12 @@ again:
* taking up quite a bit since it's not folded into the other space
* cache.
*/
- num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+ num_pages = div_u64(block_group->key.offset, SZ_256M);
if (!num_pages)
num_pages = 1;
num_pages *= 16;
- num_pages *= PAGE_CACHE_SIZE;
+ num_pages *= PAGE_SIZE;
ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
@@ -3684,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM;
/*
- * We don't need the lock here since we are protected by the transaction
- * commit. We want to do the cache_save_setup first and then run the
+ * Even though we are in the critical section of the transaction commit,
+ * we can still have concurrent tasks adding elements to this
+ * transaction's list of dirty block groups. These tasks correspond to
+ * endio free space workers started when writeback finishes for a
+ * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+ * allocate new block groups as a result of COWing nodes of the root
+ * tree when updating the free space inode. The writeback for the space
+ * caches is triggered by an earlier call to
+ * btrfs_start_dirty_block_groups() and iterations of the following
+ * loop.
+ * Also we want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all
* in one shot.
*/
+ spin_lock(&cur_trans->dirty_bgs_lock);
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
@@ -3700,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* finish and then do it all again
*/
if (!list_empty(&cache->io_list)) {
+ spin_unlock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
/*
@@ -3712,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
* on any pending IO
*/
list_del_init(&cache->dirty_list);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
should_put = 1;
cache_save_setup(cache, trans, path);
@@ -3736,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
if (!ret) {
ret = write_one_cache_group(trans, root, path, cache);
+ /*
+ * One of the free space endio workers might have
+ * created a new block group while updating a free space
+ * cache's inode (at inode.c:btrfs_finish_ordered_io())
+ * and hasn't released its transaction handle yet, in
+ * which case the new block group is still attached to
+ * its transaction handle and its creation has not
+ * finished yet (no block group item in the extent tree
+ * yet, etc). If this is the case, wait for all free
+ * space endio workers to finish and retry. This is a
+ * a very rare case so no need for a more efficient and
+ * complex approach.
+ */
+ if (ret == -ENOENT) {
+ wait_event(cur_trans->writer_wait,
+ atomic_read(&cur_trans->num_writers) == 1);
+ ret = write_one_cache_group(trans, root, path,
+ cache);
+ }
if (ret)
btrfs_abort_transaction(trans, root, ret);
}
@@ -3743,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/* if its not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
+ spin_lock(&cur_trans->dirty_bgs_lock);
}
+ spin_unlock(&cur_trans->dirty_bgs_lock);
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -4086,8 +4139,10 @@ commit_trans:
!atomic_read(&root->fs_info->open_ioctl_trans)) {
need_commit--;
- if (need_commit > 0)
+ if (need_commit > 0) {
+ btrfs_start_delalloc_roots(fs_info, 0, -1);
btrfs_wait_ordered_roots(fs_info, -1);
+ }
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -4100,11 +4155,12 @@ commit_trans:
if (ret)
return ret;
/*
- * make sure that all running delayed iput are
- * done
+ * The cleaner kthread might still be doing iput
+ * operations. Wait for it to finish so that
+ * more space is released.
*/
- down_write(&root->fs_info->delayed_iput_sem);
- up_write(&root->fs_info->delayed_iput_sem);
+ mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
+ mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
goto again;
} else {
btrfs_end_transaction(trans, root);
@@ -4239,14 +4295,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- thresh = max_t(u64, 64 * 1024 * 1024,
- div_factor_fine(thresh, 1));
+ thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
if (num_bytes - num_allocated < thresh)
return 1;
}
- if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+ if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -4446,7 +4501,7 @@ out:
* transaction.
*/
if (trans->can_flush_pending_bgs &&
- trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ trans->chunk_bytes_reserved >= (u64)SZ_2M) {
btrfs_create_pending_block_groups(trans, trans->root);
btrfs_trans_release_chunk_metadata(trans);
}
@@ -4544,7 +4599,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
return nr;
}
-#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+#define EXTENT_SIZE_PER_ITEM SZ_256K
/*
* shrink metadata reservation for delalloc
@@ -4584,7 +4639,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
loops = 0;
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
- nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+ nr_pages = max_reclaim >> PAGE_SHIFT;
btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
/*
* We need to wait for the async pages to actually start before
@@ -4749,8 +4804,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
u64 expected;
u64 to_reclaim;
- to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
- 16 * 1024 * 1024);
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
BTRFS_RESERVE_FLUSH_ALL)) {
@@ -4761,8 +4815,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
- if (can_overcommit(root, space_info, 1024 * 1024,
- BTRFS_RESERVE_FLUSH_ALL))
+ if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -4785,7 +4838,7 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
/* If we're just plain full then async reclaim just slows us down. */
- if (space_info->bytes_used >= thresh)
+ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
@@ -5318,29 +5371,35 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
- block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
-
- num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
- sinfo->bytes_reserved + sinfo->bytes_readonly +
- sinfo->bytes_may_use;
-
- if (sinfo->total_bytes > num_bytes) {
- num_bytes = sinfo->total_bytes - num_bytes;
- block_rsv->reserved += num_bytes;
- sinfo->bytes_may_use += num_bytes;
- trace_btrfs_space_reservation(fs_info, "space_info",
- sinfo->flags, num_bytes, 1);
- }
-
- if (block_rsv->reserved >= block_rsv->size) {
+ block_rsv->size = min_t(u64, num_bytes, SZ_512M);
+
+ if (block_rsv->reserved < block_rsv->size) {
+ num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+ sinfo->bytes_reserved + sinfo->bytes_readonly +
+ sinfo->bytes_may_use;
+ if (sinfo->total_bytes > num_bytes) {
+ num_bytes = sinfo->total_bytes - num_bytes;
+ num_bytes = min(num_bytes,
+ block_rsv->size - block_rsv->reserved);
+ block_rsv->reserved += num_bytes;
+ sinfo->bytes_may_use += num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ sinfo->flags, num_bytes,
+ 1);
+ }
+ } else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
sinfo->bytes_may_use -= num_bytes;
trace_btrfs_space_reservation(fs_info, "space_info",
sinfo->flags, num_bytes, 0);
block_rsv->reserved = block_rsv->size;
- block_rsv->full = 1;
}
+ if (block_rsv->reserved == block_rsv->size)
+ block_rsv->full = 1;
+ else
+ block_rsv->full = 0;
+
spin_unlock(&block_rsv->lock);
spin_unlock(&sinfo->lock);
}
@@ -5699,7 +5758,7 @@ out_fail:
/*
* This is tricky, but first we need to figure out how much we
- * free'd from any free-ers that occured during this
+ * free'd from any free-ers that occurred during this
* reservation, so we reset ->csum_bytes to the csum_bytes
* before we dropped our lock, and then call the free for the
* number of bytes that were freed while we were trying our
@@ -5915,19 +5974,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
- /*
- * No longer have used bytes in this block group, queue
- * it for deletion.
- */
- if (old_val == 0) {
- spin_lock(&info->unused_bgs_lock);
- if (list_empty(&cache->bg_list)) {
- btrfs_get_block_group(cache);
- list_add_tail(&cache->bg_list,
- &info->unused_bgs);
- }
- spin_unlock(&info->unused_bgs_lock);
- }
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5939,6 +5985,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
+ /*
+ * No longer have used bytes in this block group, queue it for
+ * deletion. We do this after adding the block group to the
+ * dirty list to avoid races between cleaner kthread and space
+ * cache writeout.
+ */
+ if (!alloc && old_val == 0) {
+ spin_lock(&info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->unused_bgs);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+ }
+
btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
@@ -6219,11 +6281,11 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
return ret;
if (ssd)
- *empty_cluster = 2 * 1024 * 1024;
+ *empty_cluster = SZ_2M;
if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
ret = &root->fs_info->meta_alloc_cluster;
if (!ssd)
- *empty_cluster = 64 * 1024;
+ *empty_cluster = SZ_64K;
} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
ret = &root->fs_info->data_alloc_cluster;
}
@@ -6435,7 +6497,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -6658,6 +6720,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+ num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
+ }
+
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
@@ -6955,7 +7024,7 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
int delalloc)
{
- struct btrfs_block_group_cache *used_bg;
+ struct btrfs_block_group_cache *used_bg = NULL;
bool locked = false;
again:
spin_lock(&cluster->refill_lock);
@@ -7669,6 +7738,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ ins->offset);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7749,6 +7823,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+ num_bytes);
+ if (ret)
+ return ret;
+
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
@@ -7831,7 +7910,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
btrfs_set_lock_blocking(buf);
- btrfs_set_buffer_uptodate(buf);
+ set_extent_buffer_uptodate(buf);
if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
buf->log_index = root->log_transid % 2;
@@ -7977,12 +8056,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
else
memset(&extent_op->key, 0, sizeof(extent_op->key));
extent_op->flags_to_set = flags;
- if (skinny_metadata)
- extent_op->update_key = 0;
- else
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
+ extent_op->update_key = skinny_metadata ? false : true;
+ extent_op->update_flags = true;
+ extent_op->is_data = false;
extent_op->level = level;
ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
@@ -8105,21 +8181,47 @@ reada:
}
/*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
*/
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_qgroup_extent_record *qrecord;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+ if (!qrecord)
+ return -ENOMEM;
+
+ qrecord->bytenr = bytenr;
+ qrecord->num_bytes = num_bytes;
+ qrecord->old_roots = NULL;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+ kfree(qrecord);
+ spin_unlock(&delayed_refs->lock);
+
+ return 0;
+}
+
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
{
int nr = btrfs_header_nritems(eb);
- int i, extent_type;
+ int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
+ /* We can be called directly from walk_up_proc() */
+ if (!root->fs_info->quota_enabled)
+ return 0;
+
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, &key, i);
@@ -8138,6 +8240,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+ ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -8206,8 +8312,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
/*
* root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
*/
static int account_shared_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -8285,6 +8389,11 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+ ret = record_one_subtree_extent(trans, root, child_bytenr,
+ root->nodesize);
+ if (ret)
+ goto out;
}
if (level == 0) {
@@ -9088,7 +9197,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
if ((sinfo->flags &
(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
!force)
- min_allocable_bytes = 1 * 1024 * 1024;
+ min_allocable_bytes = SZ_1M;
else
min_allocable_bytes = 0;
@@ -9277,15 +9386,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
u64 dev_min = 1;
u64 dev_nr = 0;
u64 target;
+ int debug;
int index;
int full = 0;
int ret = 0;
+ debug = btrfs_test_opt(root, ENOSPC_DEBUG);
+
block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
/* odd, couldn't find the block group, leave it alone */
- if (!block_group)
+ if (!block_group) {
+ if (debug)
+ btrfs_warn(root->fs_info,
+ "can't find block group for bytenr %llu",
+ bytenr);
return -1;
+ }
min_free = btrfs_block_group_used(&block_group->item);
@@ -9339,8 +9456,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* this is just a balance, so if we were marked as full
* we know there is no space for a new chunk
*/
- if (full)
+ if (full) {
+ if (debug)
+ btrfs_warn(root->fs_info,
+ "no space to alloc new chunk for block group %llu",
+ block_group->key.objectid);
goto out;
+ }
index = get_block_group_index(block_group);
}
@@ -9387,6 +9509,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
ret = -1;
}
}
+ if (debug && ret == -1)
+ btrfs_warn(root->fs_info,
+ "no space to allocate a new chunk for block group %llu",
+ block_group->key.objectid);
mutex_unlock(&root->fs_info->chunk_mutex);
btrfs_end_transaction(trans, root);
out:
@@ -9620,6 +9746,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
cache->full_stripe_len = btrfs_full_stripe_len(root,
&root->fs_info->mapping_tree,
start);
+ set_free_space_tree_thresholds(cache);
+
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@@ -9631,6 +9759,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
+ mutex_init(&cache->free_space_lock);
return cache;
}
@@ -9655,7 +9784,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (btrfs_test_opt(root, SPACE_CACHE) &&
@@ -9841,6 +9970,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
key.objectid, key.offset);
if (ret)
btrfs_abort_transaction(trans, extent_root, ret);
+ add_block_group_free_space(trans, root->fs_info, block_group);
+ /* already aborted the transaction if it failed. */
next:
list_del_init(&block_group->bg_list);
}
@@ -9871,6 +10002,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->needs_free_space = 1;
ret = exclude_super_stripes(root, cache);
if (ret) {
/*
@@ -10241,6 +10373,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
unlock_chunks(root);
+ ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+ if (ret)
+ goto out;
+
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -10256,6 +10392,47 @@ out:
return ret;
}
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+ const u64 chunk_offset)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ unsigned int num_items;
+
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+ read_unlock(&em_tree->lock);
+ ASSERT(em && em->start == chunk_offset);
+
+ /*
+ * We need to reserve 3 + N units from the metadata space info in order
+ * to remove a block group (done at btrfs_remove_chunk() and at
+ * btrfs_remove_block_group()), which are used for:
+ *
+ * 1 unit for adding the free space inode's orphan (located in the tree
+ * of tree roots).
+ * 1 unit for deleting the block group item (located in the extent
+ * tree).
+ * 1 unit for deleting the free space item (located in tree of tree
+ * roots).
+ * N units for deleting N device extent items corresponding to each
+ * stripe (located in the device tree).
+ *
+ * In order to remove a block group we also need to reserve units in the
+ * system space info in order to update the chunk tree (update one or
+ * more device items and remove one chunk item), but this is done at
+ * btrfs_remove_chunk() through a call to check_system_chunk().
+ */
+ map = em->map_lookup;
+ num_items = 3 + map->num_stripes;
+ free_extent_map(em);
+
+ return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+ num_items, 1);
+}
+
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
@@ -10279,22 +10456,25 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
- space_info = block_group->space_info;
list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved ||
btrfs_block_group_used(&block_group->item) ||
- block_group->ro) {
+ block_group->ro ||
+ list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -10319,8 +10499,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- /* 1 for btrfs_orphan_reserve_metadata() */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_trans_remove_block_group(fs_info,
+ block_group->key.objectid);
if (IS_ERR(trans)) {
btrfs_dec_block_group_ro(root, block_group);
ret = PTR_ERR(trans);
@@ -10400,17 +10580,21 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* until transaction commit to do the actual discard.
*/
if (trimming) {
- WARN_ON(!list_empty(&block_group->bg_list));
- spin_lock(&trans->transaction->deleted_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ /*
+ * A concurrent scrub might have added us to the list
+ * fs_info->unused_bgs, so use a list_move operation
+ * to add the block group to the deleted_bgs list.
+ */
list_move(&block_group->bg_list,
&trans->transaction->deleted_bgs);
- spin_unlock(&trans->transaction->deleted_bgs_lock);
+ spin_unlock(&fs_info->unused_bgs_lock);
btrfs_get_block_group(block_group);
}
end_trans:
btrfs_end_transaction(trans, root);
next:
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -10428,7 +10612,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
- return 1;
+ return -EINVAL;
features = btrfs_super_incompat_flags(disk_super);
if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
@@ -10658,3 +10842,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
}
return 1;
}
+
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+ schedule();
+ return 0;
+}
+
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
+{
+ while (true) {
+ int ret;
+
+ ret = btrfs_start_write_no_snapshoting(root);
+ if (ret)
+ break;
+ wait_on_atomic_t(&root->will_be_snapshoted,
+ wait_snapshoting_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+ }
+}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/fs/btrfs/extent-tree.h
+++ /dev/null
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 33a01ea41465..d247fc0eea19 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -206,10 +206,8 @@ void extent_io_exit(void)
* destroy caches.
*/
rcu_barrier();
- if (extent_state_cache)
- kmem_cache_destroy(extent_state_cache);
- if (extent_buffer_cache)
- kmem_cache_destroy(extent_buffer_cache);
+ kmem_cache_destroy(extent_state_cache);
+ kmem_cache_destroy(extent_buffer_cache);
if (btrfs_bioset)
bioset_free(btrfs_bioset);
}
@@ -232,7 +230,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
if (!state)
return state;
state->state = 0;
- state->private = 0;
+ state->failrec = NULL;
RB_CLEAR_NODE(&state->rb_node);
btrfs_leak_debug_add(&state->leak_list, &states);
atomic_set(&state->refs, 1);
@@ -616,7 +614,7 @@ static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@@ -741,7 +739,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -874,7 +872,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bits |= EXTENT_FIRST_DELALLOC;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
prealloc = alloc_extent_state(mask);
BUG_ON(!prealloc);
}
@@ -1052,7 +1050,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -1100,7 +1098,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
@@ -1278,27 +1276,13 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
first_iteration = false;
goto again;
}
/* wrappers around set/clear extent bit */
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
- NULL, mask);
-}
-
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, bits, NULL,
- NULL, mask);
-}
-
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset)
@@ -1323,17 +1307,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
cached, mask, NULL);
}
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask)
-{
- int wake = 0;
-
- if (bits & EXTENT_LOCKED)
- wake = 1;
-
- return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
-}
-
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset)
@@ -1348,63 +1321,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
changeset);
}
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE,
- NULL, cached_state, mask);
-}
-
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
- NULL, cached_state, mask);
-}
-
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
-}
-
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
- NULL, mask);
-}
-
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
- cached_state, mask);
-}
-
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
- cached_state, mask);
-}
-
/*
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached_state)
+ struct extent_state **cached_state)
{
int err;
u64 failed_start;
while (1) {
- err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
EXTENT_LOCKED, &failed_start,
cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
@@ -1417,11 +1345,6 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
return err;
}
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return lock_extent_bits(tree, start, end, 0, NULL);
-}
-
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
int err;
@@ -1438,39 +1361,25 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
return 1;
}
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask)
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
- mask);
-}
-
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
- GFP_NOFS);
-}
-
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
-{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
page = find_get_page(inode->i_mapping, index);
BUG_ON(!page); /* Pages should be in the extent_io_tree */
clear_page_dirty_for_io(page);
- page_cache_release(page);
+ put_page(page);
index++;
}
- return 0;
}
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
@@ -1478,29 +1387,27 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
BUG_ON(!page); /* Pages should be in the extent_io_tree */
__set_page_dirty_nobuffers(page);
account_page_redirty(page);
- page_cache_release(page);
+ put_page(page);
index++;
}
- return 0;
}
/*
* helper function to set both pages and extents in the tree writeback
*/
-static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
struct page *page;
while (index <= end_index) {
page = find_get_page(tree->mapping, index);
BUG_ON(!page); /* Pages should be in the extent_io_tree */
set_page_writeback(page);
- page_cache_release(page);
+ put_page(page);
index++;
}
- return 0;
}
/* find the first state struct with 'bits' set after 'start', and
@@ -1649,8 +1556,8 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
{
int ret;
struct page *pages[16];
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
@@ -1664,7 +1571,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
for (i = 0; i < ret; i++) {
if (pages[i] != locked_page)
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
@@ -1677,9 +1584,9 @@ static noinline int lock_delalloc_pages(struct inode *inode,
u64 delalloc_start,
u64 delalloc_end)
{
- unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+ unsigned long index = delalloc_start >> PAGE_SHIFT;
unsigned long start_index = index;
- unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = delalloc_end >> PAGE_SHIFT;
unsigned long pages_locked = 0;
struct page *pages[16];
unsigned long nrpages;
@@ -1712,11 +1619,11 @@ static noinline int lock_delalloc_pages(struct inode *inode,
pages[i]->mapping != inode->i_mapping) {
ret = -EAGAIN;
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
goto done;
}
}
- page_cache_release(pages[i]);
+ put_page(pages[i]);
pages_locked++;
}
nrpages -= ret;
@@ -1729,7 +1636,7 @@ done:
__unlock_for_delalloc(inode, locked_page,
delalloc_start,
((u64)(start_index + pages_locked - 1)) <<
- PAGE_CACHE_SHIFT);
+ PAGE_SHIFT);
}
return ret;
}
@@ -1789,7 +1696,7 @@ again:
free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
- max_bytes = PAGE_CACHE_SIZE;
+ max_bytes = PAGE_SIZE;
loops = 1;
goto again;
} else {
@@ -1800,7 +1707,7 @@ again:
BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
/* step three, lock the state bits for the whole range */
- lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
+ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1820,7 +1727,7 @@ out_failed:
return found;
}
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned clear_bits,
unsigned long page_ops)
@@ -1828,14 +1735,14 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
int ret;
struct page *pages[16];
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (page_ops == 0)
- return 0;
+ return;
if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
mapping_set_error(inode->i_mapping, -EIO);
@@ -1850,7 +1757,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
SetPagePrivate2(pages[i]);
if (pages[i] == locked_page) {
- page_cache_release(pages[i]);
+ put_page(pages[i]);
continue;
}
if (page_ops & PAGE_CLEAR_DIRTY)
@@ -1863,13 +1770,12 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
end_page_writeback(pages[i]);
if (page_ops & PAGE_UNLOCK)
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
cond_resched();
}
- return 0;
}
/*
@@ -1936,7 +1842,8 @@ out:
* set the private field for a given byte offset in the tree. If there isn't
* an extent_state there already, this does nothing.
*/
-static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record *failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -1957,13 +1864,14 @@ static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private
ret = -ENOENT;
goto out;
}
- state->private = private;
+ state->failrec = failrec;
out:
spin_unlock(&tree->lock);
return ret;
}
-int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
+ struct io_failure_record **failrec)
{
struct rb_node *node;
struct extent_state *state;
@@ -1984,7 +1892,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
ret = -ENOENT;
goto out;
}
- *private = state->private;
+ *failrec = state->failrec;
out:
spin_unlock(&tree->lock);
return ret;
@@ -2053,7 +1961,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
{
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
}
@@ -2064,7 +1972,7 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec)
int err = 0;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
- set_state_private(failure_tree, rec->start, 0);
+ set_state_failrec(failure_tree, rec->start, NULL);
ret = clear_extent_bits(failure_tree, rec->start,
rec->start + rec->len - 1,
EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
@@ -2163,11 +2071,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
struct page *p = eb->pages[i];
ret = repair_io_failure(root->fs_info->btree_inode, start,
- PAGE_CACHE_SIZE, start, p,
+ PAGE_SIZE, start, p,
start - page_offset(p), mirror_num);
if (ret)
break;
- start += PAGE_CACHE_SIZE;
+ start += PAGE_SIZE;
}
return ret;
@@ -2181,7 +2089,6 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
unsigned int pg_offset)
{
u64 private;
- u64 private_failure;
struct io_failure_record *failrec;
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct extent_state *state;
@@ -2194,12 +2101,11 @@ int clean_io_failure(struct inode *inode, u64 start, struct page *page,
if (!ret)
return 0;
- ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
- &private_failure);
+ ret = get_state_failrec(&BTRFS_I(inode)->io_failure_tree, start,
+ &failrec);
if (ret)
return 0;
- failrec = (struct io_failure_record *)(unsigned long) private_failure;
BUG_ON(!failrec->this_mirror);
if (failrec->in_validation) {
@@ -2259,7 +2165,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
next = next_state(state);
- failrec = (struct io_failure_record *)(unsigned long)state->private;
+ failrec = state->failrec;
free_extent_state(state);
kfree(failrec);
@@ -2269,10 +2175,9 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
}
int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
- struct io_failure_record **failrec_ret)
+ struct io_failure_record **failrec_ret)
{
struct io_failure_record *failrec;
- u64 private;
struct extent_map *em;
struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -2280,7 +2185,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
int ret;
u64 logical;
- ret = get_state_private(failure_tree, start, &private);
+ ret = get_state_failrec(failure_tree, start, &failrec);
if (ret) {
failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
if (!failrec)
@@ -2329,8 +2234,7 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
ret = set_extent_bits(failure_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
if (ret >= 0)
- ret = set_state_private(failure_tree, start,
- (u64)(unsigned long)failrec);
+ ret = set_state_failrec(failure_tree, start, failrec);
/* set the bits in the inode's tree */
if (ret >= 0)
ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
@@ -2340,7 +2244,6 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
return ret;
}
} else {
- failrec = (struct io_failure_record *)(unsigned long)private;
pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
failrec->logical, failrec->start, failrec->len,
failrec->in_validation);
@@ -2516,7 +2419,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
/* lots and lots of room for performance fixes in the end_bio funcs */
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
@@ -2537,7 +2440,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
ret = ret < 0 ? ret : -EIO;
mapping_set_error(page->mapping, ret);
}
- return 0;
}
/*
@@ -2564,8 +2466,8 @@ static void end_bio_extent_writepage(struct bio *bio)
* advance bv_offset and adjust bv_len to compensate.
* Print a warning for nonzero offsets, and an error
* if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
"partial page write in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
@@ -2579,9 +2481,7 @@ static void end_bio_extent_writepage(struct bio *bio)
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- if (end_extent_writepage(page, bio->bi_error, start, end))
- continue;
-
+ end_extent_writepage(page, bio->bi_error, start, end);
end_page_writeback(page);
}
@@ -2641,8 +2541,8 @@ static void end_bio_extent_readpage(struct bio *bio)
* advance bv_offset and adjust bv_len to compensate.
* Print a warning for nonzero offsets, and an error
* if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
- if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
"partial page read in btrfs with offset %u and length %u",
bvec->bv_offset, bvec->bv_len);
@@ -2698,13 +2598,13 @@ static void end_bio_extent_readpage(struct bio *bio)
readpage_ok:
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned off;
/* Zero out the end if this page straddles i_size */
- off = i_size & (PAGE_CACHE_SIZE-1);
+ off = i_size & (PAGE_SIZE-1);
if (page->index == end_index && off)
- zero_user_segment(page, off, PAGE_CACHE_SIZE);
+ zero_user_segment(page, off, PAGE_SIZE);
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
@@ -2868,7 +2768,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
struct bio *bio;
int contig = 0;
int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
- size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+ size_t page_size = min_t(size_t, size, PAGE_SIZE);
if (bio_ret && *bio_ret) {
bio = *bio_ret;
@@ -2921,7 +2821,7 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long)eb);
} else {
WARN_ON(page->private != (unsigned long)eb);
@@ -2932,7 +2832,7 @@ void set_page_extent_mapped(struct page *page)
{
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, EXTENT_PAGE_PRIVATE);
}
}
@@ -2980,7 +2880,7 @@ static int __do_readpage(struct extent_io_tree *tree,
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = start + PAGE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
@@ -2992,12 +2892,11 @@ static int __do_readpage(struct extent_io_tree *tree,
struct block_device *bdev;
int ret;
int nr = 0;
- int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
size_t pg_offset = 0;
size_t iosize;
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
- unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+ unsigned long this_bio_flag = 0;
set_page_extent_mapped(page);
@@ -3010,12 +2909,12 @@ static int __do_readpage(struct extent_io_tree *tree,
}
}
- if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+ if (page->index == last_byte >> PAGE_SHIFT) {
char *userpage;
- size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+ size_t zero_offset = last_byte & (PAGE_SIZE - 1);
if (zero_offset) {
- iosize = PAGE_CACHE_SIZE - zero_offset;
+ iosize = PAGE_SIZE - zero_offset;
userpage = kmap_atomic(page);
memset(userpage + zero_offset, 0, iosize);
flush_dcache_page(page);
@@ -3023,32 +2922,30 @@ static int __do_readpage(struct extent_io_tree *tree,
}
}
while (cur <= end) {
- unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+ unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1;
bool force_bio_submit = false;
if (cur >= last_byte) {
char *userpage;
struct extent_state *cached = NULL;
- iosize = PAGE_CACHE_SIZE - pg_offset;
+ iosize = PAGE_SIZE - pg_offset;
userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- if (!parent_locked)
- unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, get_extent, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, end);
+ unlock_extent(tree, cur, end);
break;
}
extent_offset = cur - em->start;
@@ -3133,12 +3030,9 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- if (parent_locked)
- free_extent_state(cached);
- else
- unlock_extent_cached(tree, cur,
- cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3147,8 +3041,7 @@ static int __do_readpage(struct extent_io_tree *tree,
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3158,8 +3051,7 @@ static int __do_readpage(struct extent_io_tree *tree,
*/
if (block_start == EXTENT_MAP_INLINE) {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3178,8 +3070,7 @@ static int __do_readpage(struct extent_io_tree *tree,
*bio_flags = this_bio_flag;
} else {
SetPageError(page);
- if (!parent_locked)
- unlock_extent(tree, cur, cur + iosize - 1);
+ unlock_extent(tree, cur, cur + iosize - 1);
}
cur = cur + iosize;
pg_offset += iosize;
@@ -3221,7 +3112,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], get_extent, em_cached, bio,
mirror_num, bio_flags, rw, prev_em_start);
- page_cache_release(pages[index]);
+ put_page(pages[index]);
}
}
@@ -3243,10 +3134,10 @@ static void __extent_readpages(struct extent_io_tree *tree,
page_start = page_offset(pages[index]);
if (!end) {
start = page_start;
- end = start + PAGE_CACHE_SIZE - 1;
+ end = start + PAGE_SIZE - 1;
first_index = index;
} else if (end + 1 == page_start) {
- end += PAGE_CACHE_SIZE;
+ end += PAGE_SIZE;
} else {
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
@@ -3254,7 +3145,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
bio, mirror_num, bio_flags,
rw, prev_em_start);
start = page_start;
- end = start + PAGE_CACHE_SIZE - 1;
+ end = start + PAGE_SIZE - 1;
first_index = index;
}
}
@@ -3276,12 +3167,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct inode *inode = page->mapping->host;
struct btrfs_ordered_extent *ordered;
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
int ret;
while (1) {
lock_extent(tree, start, end);
- ordered = btrfs_lookup_ordered_extent(inode, start);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ PAGE_SIZE);
if (!ordered)
break;
unlock_extent(tree, start, end);
@@ -3308,20 +3200,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
return ret;
}
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
- int ret;
-
- ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
- &bio_flags, READ, NULL);
- if (bio)
- ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
- return ret;
-}
-
static noinline void update_nr_written(struct page *page,
struct writeback_control *wbc,
unsigned long nr_written)
@@ -3349,7 +3227,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
unsigned long *nr_written)
{
struct extent_io_tree *tree = epd->tree;
- u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = delalloc_start + PAGE_SIZE - 1;
u64 nr_delalloc;
u64 delalloc_to_write = 0;
u64 delalloc_end = 0;
@@ -3386,13 +3264,11 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
goto done;
}
/*
- * delalloc_end is already one less than the total
- * length, so we don't subtract one from
- * PAGE_CACHE_SIZE
+ * delalloc_end is already one less than the total length, so
+ * we don't subtract one from PAGE_SIZE
*/
delalloc_to_write += (delalloc_end - delalloc_start +
- PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SIZE) >> PAGE_SHIFT;
delalloc_start = delalloc_end + 1;
}
if (wbc->nr_to_write < delalloc_to_write) {
@@ -3441,7 +3317,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
{
struct extent_io_tree *tree = epd->tree;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = start + PAGE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
@@ -3556,7 +3432,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
if (ret) {
SetPageError(page);
} else {
- unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
+ unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1;
set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
@@ -3599,12 +3475,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct inode *inode = page->mapping->host;
struct extent_page_data *epd = data;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = start + PAGE_SIZE - 1;
int ret;
int nr = 0;
size_t pg_offset = 0;
loff_t i_size = i_size_read(inode);
- unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = i_size >> PAGE_SHIFT;
int write_flags;
unsigned long nr_written = 0;
@@ -3619,10 +3495,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
ClearPageError(page);
- pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ pg_offset = i_size & (PAGE_SIZE - 1);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0;
}
@@ -3632,7 +3508,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
userpage = kmap_atomic(page);
memset(userpage + pg_offset, 0,
- PAGE_CACHE_SIZE - pg_offset);
+ PAGE_SIZE - pg_offset);
kunmap_atomic(userpage);
flush_dcache_page(page);
}
@@ -3870,7 +3746,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
- PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+ PAGE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
0, epd->bio_flags, bio_flags, false);
epd->bio_flags = bio_flags;
@@ -3882,7 +3758,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
ret = -EIO;
break;
}
- offset += PAGE_CACHE_SIZE;
+ offset += PAGE_SIZE;
update_nr_written(p, wbc, 1);
unlock_page(p);
}
@@ -3926,8 +3802,8 @@ int btree_write_cache_pages(struct address_space *mapping,
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -4070,8 +3946,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -4205,8 +4081,8 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
int ret = 0;
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = (end - start + PAGE_SIZE) >>
+ PAGE_SHIFT;
struct extent_page_data epd = {
.bio = NULL,
@@ -4224,18 +4100,18 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
};
while (start <= end) {
- page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ page = find_get_page(mapping, start >> PAGE_SHIFT);
if (clear_page_dirty_for_io(page))
ret = __extent_writepage(page, &wbc_writepages, &epd);
else {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
- start + PAGE_CACHE_SIZE - 1,
+ start + PAGE_SIZE - 1,
NULL, 1);
unlock_page(page);
}
- page_cache_release(page);
- start += PAGE_CACHE_SIZE;
+ put_page(page);
+ start += PAGE_SIZE;
}
flush_epd_write_bio(&epd);
@@ -4285,7 +4161,7 @@ int extent_readpages(struct extent_io_tree *tree,
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping,
page->index, GFP_NOFS)) {
- page_cache_release(page);
+ put_page(page);
continue;
}
@@ -4319,14 +4195,14 @@ int extent_invalidatepage(struct extent_io_tree *tree,
{
struct extent_state *cached_state = NULL;
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
size_t blocksize = page->mapping->host->i_sb->s_blocksize;
start += ALIGN(offset, blocksize);
if (start > end)
return 0;
- lock_extent_bits(tree, start, end, 0, &cached_state);
+ lock_extent_bits(tree, start, end, &cached_state);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -4345,7 +4221,7 @@ static int try_release_extent_state(struct extent_map_tree *map,
struct page *page, gfp_t mask)
{
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
int ret = 1;
if (test_range_bit(tree, start, end,
@@ -4384,10 +4260,10 @@ int try_release_extent_mapping(struct extent_map_tree *map,
{
struct extent_map *em;
u64 start = page_offset(page);
- u64 end = start + PAGE_CACHE_SIZE - 1;
+ u64 end = start + PAGE_SIZE - 1;
- if ((mask & __GFP_WAIT) &&
- page->mapping->host->i_size > 16 * 1024 * 1024) {
+ if (gfpflags_allow_blocking(mask) &&
+ page->mapping->host->i_size > SZ_16M) {
u64 len;
while (start <= end) {
len = end - start + 1;
@@ -4536,7 +4412,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4709,14 +4585,14 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
ClearPagePrivate(page);
set_page_private(page, 0);
/* One for the page private */
- page_cache_release(page);
+ put_page(page);
}
if (mapped)
spin_unlock(&page->mapping->private_lock);
/* One for when we alloced the page */
- page_cache_release(page);
+ put_page(page);
} while (index != 0);
}
@@ -4797,24 +4673,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
return new;
}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
{
struct extent_buffer *eb;
- unsigned long len;
unsigned long num_pages;
unsigned long i;
- if (!fs_info) {
- /*
- * Called only from tests that don't always have a fs_info
- * available, but we know that nodesize is 4096
- */
- len = 4096;
- } else {
- len = fs_info->tree_root->nodesize;
- }
- num_pages = num_extent_pages(0, len);
+ num_pages = num_extent_pages(start, len);
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
@@ -4837,6 +4703,24 @@ err:
return NULL;
}
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ unsigned long len;
+
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize;
+ }
+
+ return __alloc_dummy_extent_buffer(fs_info, start, len);
+}
+
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
@@ -4893,7 +4777,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
/*
@@ -4943,7 +4827,7 @@ again:
goto free_eb;
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_CACHE_SHIFT, eb);
+ start >> PAGE_SHIFT, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -4976,7 +4860,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
unsigned long len = fs_info->tree_root->nodesize;
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
- unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
struct extent_buffer *eb;
struct extent_buffer *exists = NULL;
struct page *p;
@@ -5010,7 +4894,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (atomic_inc_not_zero(&exists->refs)) {
spin_unlock(&mapping->private_lock);
unlock_page(p);
- page_cache_release(p);
+ put_page(p);
mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
@@ -5022,7 +4906,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
*/
ClearPagePrivate(p);
WARN_ON(PageDirty(p));
- page_cache_release(p);
+ put_page(p);
}
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
@@ -5045,7 +4929,7 @@ again:
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> PAGE_CACHE_SHIFT, eb);
+ start >> PAGE_SHIFT, eb);
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
@@ -5108,7 +4992,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_lock(&fs_info->buffer_lock);
radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> PAGE_CACHE_SHIFT);
+ eb->start >> PAGE_SHIFT);
spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
@@ -5227,7 +5111,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
return was_dirty;
}
-int clear_extent_buffer_uptodate(struct extent_buffer *eb)
+void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5240,10 +5124,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
if (page)
ClearPageUptodate(page);
}
- return 0;
}
-int set_extent_buffer_uptodate(struct extent_buffer *eb)
+void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
unsigned long i;
struct page *page;
@@ -5255,7 +5138,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
page = eb->pages[i];
SetPageUptodate(page);
}
- return 0;
}
int extent_buffer_uptodate(struct extent_buffer *eb)
@@ -5284,8 +5166,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
if (start) {
WARN_ON(start < eb->start);
- start_i = (start >> PAGE_CACHE_SHIFT) -
- (eb->start >> PAGE_CACHE_SHIFT);
+ start_i = (start >> PAGE_SHIFT) -
+ (eb->start >> PAGE_SHIFT);
} else {
start_i = 0;
}
@@ -5368,18 +5250,18 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
struct page *page;
char *kaddr;
char *dst = (char *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
- cur = min(len, (PAGE_CACHE_SIZE - offset));
+ cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
memcpy(dst, kaddr + offset, cur);
@@ -5399,19 +5281,19 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
struct page *page;
char *kaddr;
char __user *dst = (char __user *)dstv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
- cur = min(len, (PAGE_CACHE_SIZE - offset));
+ cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
if (copy_to_user(dst, kaddr + offset, cur)) {
ret = -EFAULT;
@@ -5432,13 +5314,13 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
unsigned long *map_start,
unsigned long *map_len)
{
- size_t offset = start & (PAGE_CACHE_SIZE - 1);
+ size_t offset = start & (PAGE_SIZE - 1);
char *kaddr;
struct page *p;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
unsigned long end_i = (start_offset + start + min_len - 1) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
if (i != end_i)
return -EINVAL;
@@ -5448,7 +5330,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
*map_start = 0;
} else {
offset = 0;
- *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+ *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
}
if (start + min_len > eb->len) {
@@ -5461,7 +5343,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
p = eb->pages[i];
kaddr = page_address(p);
*map = kaddr + offset;
- *map_len = PAGE_CACHE_SIZE - offset;
+ *map_len = PAGE_SIZE - offset;
return 0;
}
@@ -5474,19 +5356,19 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
- cur = min(len, (PAGE_CACHE_SIZE - offset));
+ cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
ret = memcmp(ptr, kaddr + offset, cur);
@@ -5509,19 +5391,19 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
struct page *page;
char *kaddr;
char *src = (char *)srcv;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
- cur = min(len, PAGE_CACHE_SIZE - offset);
+ cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
memcpy(kaddr + offset, src, cur);
@@ -5539,19 +5421,19 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_SHIFT;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+ offset = (start_offset + start) & (PAGE_SIZE - 1);
while (len > 0) {
page = eb->pages[i];
WARN_ON(!PageUptodate(page));
- cur = min(len, PAGE_CACHE_SIZE - offset);
+ cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
memset(kaddr + offset, c, cur);
@@ -5570,19 +5452,19 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
size_t offset;
struct page *page;
char *kaddr;
- size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
- unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+ size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+ unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
WARN_ON(src->len != dst_len);
offset = (start_offset + dst_offset) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
while (len > 0) {
page = dst->pages[i];
WARN_ON(!PageUptodate(page));
- cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+ cur = min(len, (unsigned long)(PAGE_SIZE - offset));
kaddr = page_address(page);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
@@ -5594,6 +5476,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
+/*
+ * The extent buffer bitmap operations are done with byte granularity because
+ * bitmap items are not guaranteed to be aligned to a word and therefore a
+ * single word in a bitmap may straddle two pages in the extent buffer.
+ *
+ * BIT_BYTE(nr): index of the byte that holds bit number nr.
+ * BYTE_MASK: a value with the low BITS_PER_BYTE bits set.
+ * BITMAP_FIRST_BYTE_MASK(start): within one byte, selects bit
+ * (start & (BITS_PER_BYTE - 1)) and every higher bit of that byte.
+ * BITMAP_LAST_BYTE_MASK(nbits): BYTE_MASK shifted right by
+ * (-nbits & (BITS_PER_BYTE - 1)); with BITS_PER_BYTE == 8 (defined outside
+ * this hunk - confirm) that keeps the low (nbits % 8) bits of a byte, or the
+ * whole byte when nbits is a multiple of 8.
+ */
+#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
+#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BITMAP_FIRST_BYTE_MASK(start) \
+ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
+#define BITMAP_LAST_BYTE_MASK(nbits) \
+ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
+
+/*
+ * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+ * given bit number
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number
+ * @page_index: return index of the page in the extent buffer that contains the
+ * given bit number
+ * @page_offset: return offset into the page given by page_index
+ *
+ * This helper hides the ugliness of finding the byte in an extent buffer which
+ * contains a given bit.
+ *
+ * Nothing is returned directly; both results are written through @page_index
+ * and @page_offset. Only the byte is located here: the position of the bit
+ * inside that byte is left to the caller.
+ */
+static inline void eb_bitmap_offset(struct extent_buffer *eb,
+ unsigned long start, unsigned long nr,
+ unsigned long *page_index,
+ size_t *page_offset)
+{
+ size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); /* eb->start modulo PAGE_SIZE */
+ size_t byte_offset = BIT_BYTE(nr); /* byte within the bitmap item */
+ size_t offset;
+
+ /*
+ * The byte we want is the offset of the extent buffer + the offset of
+ * the bitmap item in the extent buffer + the offset of the byte in the
+ * bitmap item.
+ */
+ offset = start_offset + start + byte_offset;
+
+ *page_index = offset >> PAGE_SHIFT; /* which entry of eb->pages[] */
+ *page_offset = offset & (PAGE_SIZE - 1); /* byte position inside that page */
+}
+
+/**
+ * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @nr: bit number to test
+ *
+ * The byte holding the bit is located with eb_bitmap_offset() and read
+ * through page_address() of the matching entry in @eb->pages. A WARN_ON fires
+ * if that page is not uptodate, but the byte is still read.
+ *
+ * Return: 1 if bit @nr is set, 0 otherwise.
+ */
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+
+ eb_bitmap_offset(eb, start, nr, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
+}
+
+/**
+ * extent_buffer_bitmap_set - set an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to set
+ *
+ * Works one byte at a time. The first byte is OR-ed with
+ * BITMAP_FIRST_BYTE_MASK(@pos); every following byte that is covered up to
+ * its last bit is OR-ed with an all-ones mask; a trailing partial byte, if
+ * any, is OR-ed with the current mask restricted by
+ * BITMAP_LAST_BYTE_MASK(@pos + @len). When the range ends inside the first
+ * byte the loop is skipped and both masks are combined on that single byte.
+ *
+ * When the byte offset reaches PAGE_SIZE and bits remain, the walk continues
+ * at offset 0 of the next entry in @eb->pages. A WARN_ON fires for each page
+ * visited that is not uptodate.
+ */
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len; /* one past the last bit to set */
+ int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); /* bits from pos to the end of its byte */
+ unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_set) {
+ kaddr[offset] |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_BYTE; /* after the first byte, consume whole bytes */
+ mask_to_set = ~0U;
+ if (++offset >= PAGE_SIZE && len > 0) { /* page exhausted and bits remain */
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) { /* fewer bits left than the current byte could take */
+ mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] |= mask_to_set;
+ }
+}
+
+
+/**
+ * extent_buffer_bitmap_clear - clear an area of a bitmap
+ * @eb: the extent buffer
+ * @start: offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit
+ * @len: number of bits to clear
+ *
+ * Works one byte at a time. The first byte is AND-ed with the complement of
+ * BITMAP_FIRST_BYTE_MASK(@pos); every following byte that is covered up to
+ * its last bit is AND-ed with the complement of an all-ones mask; a trailing
+ * partial byte, if any, is AND-ed with the complement of the current mask
+ * restricted by BITMAP_LAST_BYTE_MASK(@pos + @len). When the range ends
+ * inside the first byte the loop is skipped and both masks are combined on
+ * that single byte.
+ *
+ * When the byte offset reaches PAGE_SIZE and bits remain, the walk continues
+ * at offset 0 of the next entry in @eb->pages. A WARN_ON fires for each page
+ * visited that is not uptodate.
+ */
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len)
+{
+ char *kaddr;
+ struct page *page;
+ unsigned long i;
+ size_t offset;
+ const unsigned int size = pos + len; /* one past the last bit to clear */
+ int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); /* bits from pos to the end of its byte */
+ unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+
+ eb_bitmap_offset(eb, start, pos, &i, &offset);
+ page = eb->pages[i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+
+ while (len >= bits_to_clear) {
+ kaddr[offset] &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_BYTE; /* after the first byte, consume whole bytes */
+ mask_to_clear = ~0U;
+ if (++offset >= PAGE_SIZE && len > 0) { /* page exhausted and bits remain */
+ offset = 0;
+ page = eb->pages[++i];
+ WARN_ON(!PageUptodate(page));
+ kaddr = page_address(page);
+ }
+ }
+ if (len) { /* fewer bits left than the current byte could take */
+ mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+ kaddr[offset] &= ~mask_to_clear;
+ }
+}
+
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
unsigned long distance = (src > dst) ? src - dst : dst - src;
@@ -5628,7 +5659,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
- size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
unsigned long dst_i;
unsigned long src_i;
@@ -5647,17 +5678,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
while (len > 0) {
dst_off_in_page = (start_offset + dst_offset) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
src_off_in_page = (start_offset + src_offset) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
- dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
- src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+ dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
+ src_i = (start_offset + src_offset) >> PAGE_SHIFT;
- cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+ cur = min(len, (unsigned long)(PAGE_SIZE -
src_off_in_page));
cur = min_t(unsigned long, cur,
- (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+ (unsigned long)(PAGE_SIZE - dst_off_in_page));
copy_pages(dst->pages[dst_i], dst->pages[src_i],
dst_off_in_page, src_off_in_page, cur);
@@ -5676,7 +5707,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
size_t src_off_in_page;
unsigned long dst_end = dst_offset + len - 1;
unsigned long src_end = src_offset + len - 1;
- size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+ size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
unsigned long dst_i;
unsigned long src_i;
@@ -5695,13 +5726,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
return;
}
while (len > 0) {
- dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
- src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+ dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
+ src_i = (start_offset + src_end) >> PAGE_SHIFT;
dst_off_in_page = (start_offset + dst_end) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
src_off_in_page = (start_offset + src_end) &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f4c1ae11855f..b5e0ade90e88 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,7 +29,6 @@
*/
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_TREE_LOG 2
-#define EXTENT_BIO_PARENT_LOCKED 4
#define EXTENT_BIO_FLAG_SHIFT 16
/* these are bit numbers for test/set bit */
@@ -62,6 +61,7 @@
struct extent_state;
struct btrfs_root;
struct btrfs_io_bio;
+struct io_failure_record;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
@@ -112,8 +112,7 @@ struct extent_state {
atomic_t refs;
unsigned state;
- /* for use by the FS */
- u64 private;
+ struct io_failure_record *failrec;
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
@@ -121,7 +120,7 @@ struct extent_state {
};
#define INLINE_EXTENT_BUFFER_PAGES 16
-#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
u64 start;
unsigned long len;
@@ -199,17 +198,17 @@ int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, struct extent_state **cached);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached, gfp_t mask);
+ struct extent_state **cached);
+/*
+ * Convenience wrapper: calls lock_extent_bits() on start..end of @tree with a
+ * NULL cached-state pointer and returns its result.
+ */
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return lock_extent_bits(tree, start, end, NULL);
+}
+
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
- get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
void extent_io_exit(void);
@@ -221,39 +220,105 @@ void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int filled,
struct extent_state *cached_state);
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, gfp_t mask);
+/*
+ * Clears EXTENT_LOCKED on start..end of @tree through clear_extent_bit(),
+ * with wake = 1, delete = 0, no cached state and GFP_NOFS. Returns the result
+ * of clear_extent_bit().
+ */
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ GFP_NOFS);
+}
+/*
+ * Same as unlock_extent(), except that the caller supplies the cached extent
+ * state pointer and the gfp mask handed to clear_extent_bit() (wake = 1,
+ * delete = 0). Returns the result of clear_extent_bit().
+ */
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+/*
+ * Clears @bits on start..end of @tree through clear_extent_bit() with
+ * delete = 0 and no cached state. The wake argument is 1 only when @bits
+ * includes EXTENT_LOCKED, otherwise 0. Returns the result of
+ * clear_extent_bit().
+ */
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits, gfp_t mask)
+{
+ int wake = 0;
+
+ if (bits & EXTENT_LOCKED)
+ wake = 1;
+
+ return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+}
+
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask,
struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+/*
+ * Sets @bits on start..end of @tree through set_extent_bit(), passing NULL
+ * for both failed_start and cached_state. Returns the result of
+ * set_extent_bit().
+ */
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+ u64 end, unsigned bits, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);
+}
+/*
+ * Clears EXTENT_UPTODATE on start..end of @tree through clear_extent_bit(),
+ * with wake = 0 and delete = 0, forwarding @cached_state and @mask. Returns
+ * the result of clear_extent_bit().
+ */
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ cached_state, mask);
+}
+/*
+ * Sets EXTENT_DIRTY on start..end of @tree through set_extent_bit(), passing
+ * NULL for both failed_start and cached_state. Returns the result of
+ * set_extent_bit().
+ */
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
+ NULL, mask);
+}
+/*
+ * Clears EXTENT_DIRTY, EXTENT_DELALLOC and EXTENT_DO_ACCOUNTING together on
+ * start..end of @tree through clear_extent_bit(), with wake = 0, delete = 0
+ * and no cached state. Returns the result of clear_extent_bit().
+ */
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
+}
+
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached_state, gfp_t mask);
+/*
+ * Sets EXTENT_DELALLOC and EXTENT_UPTODATE together on start..end of @tree
+ * through set_extent_bit(), with failed_start = NULL and the caller's
+ * @cached_state. Returns the result of set_extent_bit().
+ */
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE,
+ NULL, cached_state, mask);
+}
+/*
+ * Sets EXTENT_DELALLOC, EXTENT_UPTODATE and EXTENT_DEFRAG together on
+ * start..end of @tree through set_extent_bit(), with failed_start = NULL and
+ * the caller's @cached_state. Returns the result of set_extent_bit().
+ */
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end,
+ EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+ NULL, cached_state, mask);
+}
+/*
+ * Sets EXTENT_NEW on start..end of @tree through set_extent_bit(), passing
+ * NULL for both failed_start and cached_state. Returns the result of
+ * set_extent_bit().
+ */
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+ u64 end, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask);
+}
+/*
+ * Sets EXTENT_UPTODATE on start..end of @tree through set_extent_bit(), with
+ * failed_start = NULL and the caller's @cached_state. Returns the result of
+ * set_extent_bit().
+ */
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+ cached_state, mask);
+}
+
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
@@ -277,13 +342,14 @@ int extent_readpages(struct extent_io_tree *tree,
get_extent_t get_extent);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent);
-int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start);
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -299,8 +365,8 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
static inline unsigned long num_extent_pages(u64 start, u64 len)
{
- return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
- (start >> PAGE_CACHE_SHIFT);
+ return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (start >> PAGE_SHIFT);
}
static inline void extent_buffer_get(struct extent_buffer *eb)
@@ -328,19 +394,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+ unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+void set_extent_buffer_uptodate(struct extent_buffer *eb);
+void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
unsigned long *map_len);
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
unsigned bits_to_clear,
unsigned long page_ops);
@@ -357,7 +429,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
int mirror_num);
int clean_io_failure(struct inode *inode, u64 start, struct page *page,
unsigned int pg_offset);
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6a98bddd8f33..318b048eb254 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
#include <linux/hardirq.h>
#include "ctree.h"
#include "extent_map.h"
+#include "compression.h"
static struct kmem_cache *extent_map_cache;
@@ -20,8 +21,7 @@ int __init extent_map_init(void)
void extent_map_exit(void)
{
- if (extent_map_cache)
- kmem_cache_destroy(extent_map_cache);
+ kmem_cache_destroy(extent_map_cache);
}
/**
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
/**
* free_extent_map - drop reference count of an extent_map
- * @em: extent map beeing releasead
+ * @em: extent map being releasead
*
* Drops the reference out on @em by one and free the structure
* if the reference count hits zero.
@@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em)
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->bdev);
+ kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
@@ -422,7 +422,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
/**
* remove_extent_mapping - removes an extent_map from the extent tree
* @tree: extent tree to remove from
- * @em: extent map beeing removed
+ * @em: extent map being removed
*
* Removes @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b2991fd8583e..eb8b8fae036b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -32,7 +32,15 @@ struct extent_map {
u64 block_len;
u64 generation;
unsigned long flags;
- struct block_device *bdev;
+ union {
+ struct block_device *bdev;
+
+ /*
+ * used for chunk mappings
+ * flags & EXTENT_FLAG_FS_MAPPING must be set
+ */
+ struct map_lookup *map_lookup;
+ };
atomic_t refs;
unsigned int compress_type;
struct list_head list;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 58ece6558430..7a7d6e253cfc 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,13 +25,14 @@
#include "transaction.h"
#include "volumes.h"
#include "print-tree.h"
+#include "compression.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
size) - 1))
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
- PAGE_CACHE_SIZE))
+ PAGE_SIZE))
#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
sizeof(struct btrfs_ordered_sum)) / \
@@ -172,6 +173,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
+ u64 page_bytes_left;
u32 diff;
int nblocks;
int bio_index = 0;
@@ -201,8 +203,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
csum = (u8 *)dst;
}
- if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
- path->reada = 2;
+ if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
+ path->reada = READA_FORWARD;
WARN_ON(bio->bi_vcnt <= 0);
@@ -220,6 +222,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
if (dio)
offset = logical_offset;
+
+ page_bytes_left = bvec->bv_len;
while (bio_index < bio->bi_vcnt) {
if (!dio)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
@@ -243,7 +247,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
set_extent_bits(io_tree, offset,
- offset + bvec->bv_len - 1,
+ offset + root->sectorsize - 1,
EXTENT_NODATASUM, GFP_NOFS);
} else {
btrfs_info(BTRFS_I(inode)->root->fs_info,
@@ -281,13 +285,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
found:
csum += count * csum_size;
nblocks -= count;
- bio_index += count;
+
while (count--) {
- disk_bytenr += bvec->bv_len;
- offset += bvec->bv_len;
- bvec++;
+ disk_bytenr += root->sectorsize;
+ offset += root->sectorsize;
+ page_bytes_left -= root->sectorsize;
+ if (!page_bytes_left) {
+ bio_index++;
+ /*
+ * make sure we're still inside the
+ * bio before we update page_bytes_left
+ */
+ if (bio_index >= bio->bi_vcnt) {
+ WARN_ON_ONCE(count);
+ goto done;
+ }
+ bvec++;
+ page_bytes_left = bvec->bv_len;
+ }
+
}
}
+
+done:
btrfs_free_path(path);
return 0;
}
@@ -328,7 +348,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (search_commit) {
path->skip_locking = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
}
@@ -432,6 +452,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
int index;
+ int nr_sectors;
+ int i;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
u64 offset;
@@ -459,41 +481,56 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
if (!contig)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- if (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset) {
- unsigned long bytes_left;
- sums->len = this_sum_bytes;
- this_sum_bytes = 0;
- btrfs_add_ordered_sum(inode, ordered, sums);
- btrfs_put_ordered_extent(ordered);
+ data = kmap_atomic(bvec->bv_page);
- bytes_left = bio->bi_iter.bi_size - total_bytes;
+ nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ bvec->bv_len + root->sectorsize
+ - 1);
+
+ for (i = 0; i < nr_sectors; i++) {
+ if (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset) {
+ unsigned long bytes_left;
+
+ kunmap_atomic(data);
+ sums->len = this_sum_bytes;
+ this_sum_bytes = 0;
+ btrfs_add_ordered_sum(inode, ordered, sums);
+ btrfs_put_ordered_extent(ordered);
+
+ bytes_left = bio->bi_iter.bi_size - total_bytes;
+
+ sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
+ GFP_NOFS);
+ BUG_ON(!sums); /* -ENOMEM */
+ sums->len = bytes_left;
+ ordered = btrfs_lookup_ordered_extent(inode,
+ offset);
+ ASSERT(ordered); /* Logic error */
+ sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9)
+ + total_bytes;
+ index = 0;
+
+ data = kmap_atomic(bvec->bv_page);
+ }
- sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
- GFP_NOFS);
- BUG_ON(!sums); /* -ENOMEM */
- sums->len = bytes_left;
- ordered = btrfs_lookup_ordered_extent(inode, offset);
- BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
- total_bytes;
- index = 0;
+ sums->sums[index] = ~(u32)0;
+ sums->sums[index]
+ = btrfs_csum_data(data + bvec->bv_offset
+ + (i * root->sectorsize),
+ sums->sums[index],
+ root->sectorsize);
+ btrfs_csum_final(sums->sums[index],
+ (char *)(sums->sums + index));
+ index++;
+ offset += root->sectorsize;
+ this_sum_bytes += root->sectorsize;
+ total_bytes += root->sectorsize;
}
- data = kmap_atomic(bvec->bv_page);
- sums->sums[index] = ~(u32)0;
- sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
- sums->sums[index],
- bvec->bv_len);
kunmap_atomic(data);
- btrfs_csum_final(sums->sums[index],
- (char *)(sums->sums + index));
bio_index++;
- index++;
- total_bytes += bvec->bv_len;
- this_sum_bytes += bvec->bv_len;
- offset += bvec->bv_len;
bvec++;
}
this_sum_bytes = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6bd5ce9d75f0..8d7b5a45c005 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
+#include "compression.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -406,19 +407,18 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
*/
-static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
- size_t write_bytes,
+static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
struct page **prepared_pages,
struct iov_iter *i)
{
size_t copied = 0;
size_t total_copied = 0;
int pg = 0;
- int offset = pos & (PAGE_CACHE_SIZE - 1);
+ int offset = pos & (PAGE_SIZE - 1);
while (write_bytes > 0) {
size_t count = min_t(size_t,
- PAGE_CACHE_SIZE - offset, write_bytes);
+ PAGE_SIZE - offset, write_bytes);
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
@@ -448,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
if (unlikely(copied == 0))
break;
- if (copied < PAGE_CACHE_SIZE - offset) {
+ if (copied < PAGE_SIZE - offset) {
offset += copied;
} else {
pg++;
@@ -473,7 +473,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
}
@@ -499,7 +499,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
loff_t isize = i_size_read(inode);
start_pos = pos & ~((u64)root->sectorsize - 1);
- num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
+ num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize);
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -756,8 +756,16 @@ next_slot:
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid > ino ||
- key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(del_nr == 0);
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -776,8 +784,8 @@ next_slot:
btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
} else {
- WARN_ON(1);
- extent_end = search_start;
+ /* can't happen */
+ BUG();
}
/*
@@ -1283,12 +1291,13 @@ out:
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
-static int prepare_uptodate_page(struct page *page, u64 pos,
+static int prepare_uptodate_page(struct inode *inode,
+ struct page *page, u64 pos,
bool force_uptodate)
{
int ret = 0;
- if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+ if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
if (ret)
@@ -1298,6 +1307,10 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
unlock_page(page);
return -EIO;
}
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
}
return 0;
}
@@ -1310,12 +1323,13 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
size_t write_bytes, bool force_uptodate)
{
int i;
- unsigned long index = pos >> PAGE_CACHE_SHIFT;
+ unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
int faili;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE);
if (!pages[i]) {
@@ -1325,13 +1339,17 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
}
if (i == 0)
- err = prepare_uptodate_page(pages[i], pos,
+ err = prepare_uptodate_page(inode, pages[i], pos,
force_uptodate);
- if (i == num_pages - 1)
- err = prepare_uptodate_page(pages[i],
+ if (!err && i == num_pages - 1)
+ err = prepare_uptodate_page(inode, pages[i],
pos + write_bytes, false);
if (err) {
- page_cache_release(pages[i]);
+ put_page(pages[i]);
+ if (err == -EAGAIN) {
+ err = 0;
+ goto again;
+ }
faili = i - 1;
goto fail;
}
@@ -1342,7 +1360,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
fail:
while (faili >= 0) {
unlock_page(pages[faili]);
- page_cache_release(pages[faili]);
+ put_page(pages[faili]);
faili--;
}
return err;
@@ -1362,21 +1380,24 @@ fail:
static noinline int
lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
size_t num_pages, loff_t pos,
+ size_t write_bytes,
u64 *lockstart, u64 *lockend,
struct extent_state **cached_state)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start_pos;
u64 last_pos;
int i;
int ret = 0;
- start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
- last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+ start_pos = round_down(pos, root->sectorsize);
+ last_pos = start_pos
+ + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1;
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos, 0, cached_state);
+ start_pos, last_pos, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start_pos,
last_pos - start_pos + 1);
if (ordered &&
@@ -1387,7 +1408,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
cached_state, GFP_NOFS);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
@@ -1476,8 +1497,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
bool force_page_uptodate = false;
bool need_unlock;
- nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
- PAGE_CACHE_SIZE / (sizeof(struct page *)));
+ nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
+ PAGE_SIZE / (sizeof(struct page *)));
nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
nrptrs = max(nrptrs, 8);
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
@@ -1485,15 +1506,18 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
return -ENOMEM;
while (iov_iter_count(i) > 0) {
- size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t offset = pos & (PAGE_SIZE - 1);
+ size_t sector_offset;
size_t write_bytes = min(iov_iter_count(i),
- nrptrs * (size_t)PAGE_CACHE_SIZE -
+ nrptrs * (size_t)PAGE_SIZE -
offset);
size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
size_t reserve_bytes;
size_t dirty_pages;
size_t copied;
+ size_t dirty_sectors;
+ size_t num_sectors;
WARN_ON(num_pages > nrptrs);
@@ -1506,29 +1530,29 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
- reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+ sector_offset = pos & (root->sectorsize - 1);
+ reserve_bytes = round_up(write_bytes + sector_offset,
+ root->sectorsize);
- if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC)) {
- ret = check_can_nocow(inode, pos, &write_bytes);
- if (ret < 0)
- break;
- if (ret > 0) {
- /*
- * For nodata cow case, no need to reserve
- * data space.
- */
- only_release_metadata = true;
- /*
- * our prealloc extent may be smaller than
- * write_bytes, so scale down.
- */
- num_pages = DIV_ROUND_UP(write_bytes + offset,
- PAGE_CACHE_SIZE);
- reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- goto reserve_metadata;
- }
+ if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) &&
+ check_can_nocow(inode, pos, &write_bytes) > 0) {
+ /*
+ * For nodata cow case, no need to reserve
+ * data space.
+ */
+ only_release_metadata = true;
+ /*
+ * our prealloc extent may be smaller than
+ * write_bytes, so scale down.
+ */
+ num_pages = DIV_ROUND_UP(write_bytes + offset,
+ PAGE_SIZE);
+ reserve_bytes = round_up(write_bytes + sector_offset,
+ root->sectorsize);
+ goto reserve_metadata;
}
+
ret = btrfs_check_data_free_space(inode, pos, write_bytes);
if (ret < 0)
break;
@@ -1559,8 +1583,8 @@ again:
break;
ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
- pos, &lockstart, &lockend,
- &cached_state);
+ pos, write_bytes, &lockstart,
+ &lockend, &cached_state);
if (ret < 0) {
if (ret == -EAGAIN)
goto again;
@@ -1570,8 +1594,7 @@ again:
ret = 0;
}
- copied = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, i);
+ copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
/*
* if we have trouble faulting in the pages, fall
@@ -1586,7 +1609,7 @@ again:
} else {
force_page_uptodate = false;
dirty_pages = DIV_ROUND_UP(copied + offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
}
/*
@@ -1596,9 +1619,16 @@ again:
* we still have an outstanding extent for the chunk we actually
* managed to copy.
*/
- if (num_pages > dirty_pages) {
- release_bytes = (num_pages - dirty_pages) <<
- PAGE_CACHE_SHIFT;
+ num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ reserve_bytes);
+ dirty_sectors = round_up(copied + sector_offset,
+ root->sectorsize);
+ dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info,
+ dirty_sectors);
+
+ if (num_sectors > dirty_sectors) {
+ release_bytes = (write_bytes - copied)
+ & ~((u64)root->sectorsize - 1);
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
@@ -1611,13 +1641,14 @@ again:
u64 __pos;
__pos = round_down(pos, root->sectorsize) +
- (dirty_pages << PAGE_CACHE_SHIFT);
+ (dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(inode, __pos,
release_bytes);
}
}
- release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
+ release_bytes = round_up(copied + sector_offset,
+ root->sectorsize);
if (copied > 0)
ret = btrfs_dirty_pages(root, inode, pages,
@@ -1638,8 +1669,7 @@ again:
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos, root->sectorsize);
- lockend = lockstart +
- (dirty_pages << PAGE_CACHE_SHIFT) - 1;
+ lockend = round_up(pos + copied, root->sectorsize) - 1;
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
@@ -1652,7 +1682,7 @@ again:
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
- if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
+ if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1)
btrfs_btree_balance_dirty(root);
pos += copied;
@@ -1708,8 +1738,8 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
goto out;
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
- invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
out:
return written ? written : err;
}
@@ -1745,18 +1775,20 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
ssize_t err;
loff_t pos;
size_t count;
+ loff_t oldsize;
+ int clean_page = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = generic_write_checks(iocb, from);
if (err <= 0) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
current->backing_dev_info = inode_to_bdi(inode);
err = file_remove_privs(file);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
@@ -1767,7 +1799,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
* to stop this write operation to ensure FS consistency.
*/
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
err = -EROFS;
goto out;
}
@@ -1783,14 +1815,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
pos = iocb->ki_pos;
count = iov_iter_count(from);
start_pos = round_down(pos, root->sectorsize);
- if (start_pos > i_size_read(inode)) {
+ oldsize = i_size_read(inode);
+ if (start_pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
end_pos = round_up(pos + count, root->sectorsize);
- err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
+ err = btrfs_cont_expand(inode, oldsize, end_pos);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
+ if (start_pos > round_up(oldsize, root->sectorsize))
+ clean_page = 1;
}
if (sync)
@@ -1802,14 +1837,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
iocb->ki_pos = pos + num_written;
+ if (clean_page)
+ pagecache_isize_extended(inode, oldsize,
+ i_size_read(inode));
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* We also have to set last_sub_trans to the current log transid,
* otherwise subsequent syncs to a file that's been synced in this
- * transaction will appear to have already occured.
+ * transaction will appear to have already occurred.
*/
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_sub_trans = root->log_transid;
@@ -1867,15 +1905,20 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
*/
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
- struct dentry *dentry = file->f_path.dentry;
+ struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0;
bool full_sync = 0;
- const u64 len = end - start + 1;
+ u64 len;
+ /*
+ * The range length can be represented by u64; we have to do the typecasts
+ * to avoid signed overflow if it's [0, LLONG_MAX], e.g. from fsync()
+ */
+ len = (u64)end - (u64)start + 1;
trace_btrfs_sync_file(file, datasync);
/*
@@ -1888,7 +1931,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -1940,7 +1983,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = start_ordered_ops(inode, start, end);
}
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
atomic_inc(&root->log_batch);
@@ -1975,10 +2018,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- (BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed &&
- (full_sync ||
- !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
+ (full_sync && BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed) ||
+ (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
+ BTRFS_I(inode)->last_trans
+ <= root->fs_info->last_trans_committed)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -1986,7 +2030,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
@@ -2010,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
trans->sync = true;
@@ -2033,7 +2077,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* If any of the ordered extents had an error, just return it to user
@@ -2063,8 +2107,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
}
if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start,
- end - start + 1);
+ ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
btrfs_end_transaction(trans, root);
goto out;
@@ -2273,18 +2316,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
int ret = 0;
int err = 0;
unsigned int rsv_count;
- bool same_page;
+ bool same_block;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
- bool truncated_page = false;
+ bool truncated_block = false;
bool updated_inode = false;
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
- ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ inode_lock(inode);
+ ino_size = round_up(inode->i_size, root->sectorsize);
ret = find_first_non_hole(inode, &offset, &len);
if (ret < 0)
goto out_only_mutex;
@@ -2297,33 +2340,32 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
lockend = round_down(offset + len,
BTRFS_I(inode)->root->sectorsize) - 1;
- same_page = ((offset >> PAGE_CACHE_SHIFT) ==
- ((offset + len - 1) >> PAGE_CACHE_SHIFT));
-
+ same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset))
+ == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1));
/*
- * We needn't truncate any page which is beyond the end of the file
+ * We needn't truncate any block which is beyond the end of the file
* because we are sure there is no data there.
*/
/*
- * Only do this if we are in the same page and we aren't doing the
- * entire page.
+ * Only do this if we are in the same block and we aren't doing the
+ * entire block.
*/
- if (same_page && len < PAGE_CACHE_SIZE) {
+ if (same_block && len < root->sectorsize) {
if (offset < ino_size) {
- truncated_page = true;
- ret = btrfs_truncate_page(inode, offset, len, 0);
+ truncated_block = true;
+ ret = btrfs_truncate_block(inode, offset, len, 0);
} else {
ret = 0;
}
goto out_only_mutex;
}
- /* zero back part of the first page */
+ /* zero back part of the first block */
if (offset < ino_size) {
- truncated_page = true;
- ret = btrfs_truncate_page(inode, offset, 0, 0);
+ truncated_block = true;
+ ret = btrfs_truncate_block(inode, offset, 0, 0);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
@@ -2356,9 +2398,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
- truncated_page = true;
- ret = btrfs_truncate_page(inode,
- tail_start + tail_len, 0, 1);
+ truncated_block = true;
+ ret = btrfs_truncate_block(inode,
+ tail_start + tail_len,
+ 0, 1);
if (ret)
goto out_only_mutex;
}
@@ -2376,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
/*
@@ -2399,7 +2442,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
ret = btrfs_wait_ordered_range(inode, lockstart,
lockend - lockstart + 1);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
@@ -2524,7 +2567,7 @@ out_trans:
goto out_free;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
@@ -2538,7 +2581,7 @@ out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
out_only_mutex:
- if (!updated_inode && truncated_page && !ret && !err) {
+ if (!updated_inode && truncated_block && !ret && !err) {
/*
* If we only end up zeroing part of a page, we still need to
* update the inode item, so that all the time fields are
@@ -2554,7 +2597,7 @@ out_only_mutex:
ret = btrfs_end_transaction(trans, root);
}
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret && !err)
err = ret;
return err;
@@ -2591,7 +2634,7 @@ static int add_falloc_range(struct list_head *head, u64 start, u64 len)
return 0;
}
insert:
- range = kmalloc(sizeof(*range), GFP_NOFS);
+ range = kmalloc(sizeof(*range), GFP_KERNEL);
if (!range)
return -ENOMEM;
range->start = start;
@@ -2638,10 +2681,13 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret < 0)
return ret;
- mutex_lock(&inode->i_mutex);
- ret = inode_newsize_ok(inode, alloc_end);
- if (ret)
- goto out;
+ inode_lock(inode);
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret)
+ goto out;
+ }
/*
* TODO: Move these two operations after we have checked
@@ -2658,10 +2704,10 @@ static long btrfs_fallocate(struct file *file, int mode,
} else if (offset + len > inode->i_size) {
/*
* If we are fallocating from the end of the file onward we
- * need to zero out the end of the page if i_size lands in the
- * middle of a page.
+ * need to zero out the end of the block if i_size lands in the
+ * middle of a block.
*/
- ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
if (ret)
goto out;
}
@@ -2683,7 +2729,7 @@ static long btrfs_fallocate(struct file *file, int mode,
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
- locked_end, 0, &cached_state);
+ locked_end, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(inode,
alloc_end - 1);
if (ordered &&
@@ -2692,7 +2738,7 @@ static long btrfs_fallocate(struct file *file, int mode,
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end,
- &cached_state, GFP_NOFS);
+ &cached_state, GFP_KERNEL);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
@@ -2774,7 +2820,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
} else {
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
i_size_write(inode, actual_end);
btrfs_ordered_update_i_size(inode, actual_end, NULL);
ret = btrfs_update_inode(trans, root, inode);
@@ -2786,7 +2832,7 @@ static long btrfs_fallocate(struct file *file, int mode,
}
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state, GFP_NOFS);
+ &cached_state, GFP_KERNEL);
out:
/*
* As we waited the extent range, the data_rsv_map must be empty
@@ -2796,7 +2842,7 @@ out:
* So this is completely used as cleanup.
*/
btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/* Let go of our reservation. */
btrfs_free_reserved_data_space(inode, alloc_start,
alloc_end - alloc_start);
@@ -2830,7 +2876,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
lockend--;
len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
while (start < inode->i_size) {
@@ -2872,7 +2918,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file->f_mapping->host;
int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_END:
case SEEK_CUR:
@@ -2881,20 +2927,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
case SEEK_DATA:
case SEEK_HOLE:
if (offset >= i_size_read(inode)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
ret = find_desired_extent(inode, &offset, whence);
if (ret) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
}
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
@@ -2912,12 +2958,14 @@ const struct file_operations btrfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = btrfs_ioctl,
#endif
+ .copy_file_range = btrfs_copy_file_range,
+ .clone_file_range = btrfs_clone_file_range,
+ .dedupe_file_range = btrfs_dedupe_file_range,
};
void btrfs_auto_defrag_exit(void)
{
- if (btrfs_inode_defrag_cachep)
- kmem_cache_destroy(btrfs_inode_defrag_cachep);
+ kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
int btrfs_auto_defrag_init(void)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0948d34cb84a..5e6062c26129 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -29,8 +29,8 @@
#include "inode-map.h"
#include "volumes.h"
-#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
+#define BITS_PER_BITMAP (PAGE_SIZE * 8)
+#define MAX_CACHE_BYTES_PER_GIG SZ_32K
struct btrfs_trim_range {
u64 start;
@@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) &
- ~(__GFP_FS | __GFP_HIGHMEM));
+ mapping_gfp_constraint(inode->i_mapping,
+ ~(__GFP_FS | __GFP_HIGHMEM)));
return inode;
}
@@ -295,7 +295,7 @@ static int readahead_cache(struct inode *inode)
return -ENOMEM;
file_ra_state_init(ra, inode->i_mapping);
- last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
@@ -310,14 +310,14 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
int num_pages;
int check_crcs = 0;
- num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+ num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
check_crcs = 1;
/* Make sure we can fit our crcs into the first page */
if (write && check_crcs &&
- (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ (num_pages * sizeof(u32)) >= PAGE_SIZE)
return -ENOSPC;
memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
@@ -354,9 +354,9 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
io_ctl->page = io_ctl->pages[io_ctl->index++];
io_ctl->cur = page_address(io_ctl->page);
io_ctl->orig = io_ctl->cur;
- io_ctl->size = PAGE_CACHE_SIZE;
+ io_ctl->size = PAGE_SIZE;
if (clear)
- memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+ memset(io_ctl->cur, 0, PAGE_SIZE);
}
static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
@@ -369,7 +369,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
if (io_ctl->pages[i]) {
ClearPageChecked(io_ctl->pages[i]);
unlock_page(io_ctl->pages[i]);
- page_cache_release(io_ctl->pages[i]);
+ put_page(io_ctl->pages[i]);
}
}
}
@@ -475,7 +475,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
offset = sizeof(u32) * io_ctl->num_pages;
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
io_ctl_unmap_page(io_ctl);
tmp = page_address(io_ctl->pages[0]);
@@ -503,7 +503,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
io_ctl_map_page(io_ctl, 0);
crc = btrfs_csum_data(io_ctl->orig + offset, crc,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
btrfs_err_rl(io_ctl->root->fs_info,
@@ -561,7 +561,7 @@ static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
io_ctl_map_page(io_ctl, 0);
}
- memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+ memcpy(io_ctl->cur, bitmap, PAGE_SIZE);
io_ctl_set_crc(io_ctl, io_ctl->index - 1);
if (io_ctl->index < io_ctl->num_pages)
io_ctl_map_page(io_ctl, 0);
@@ -621,7 +621,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
if (ret)
return ret;
- memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+ memcpy(entry->bitmap, io_ctl->cur, PAGE_SIZE);
io_ctl_unmap_page(io_ctl);
return 0;
@@ -775,7 +775,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
} else {
ASSERT(num_bitmaps);
num_bitmaps--;
- e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
if (!e->bitmap) {
kmem_cache_free(
btrfs_free_space_cachep, e);
@@ -891,7 +891,7 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuilding it now",
block_group->key.objectid);
}
@@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root,
static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
int ret;
/* Write out the bitmaps */
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
-
+ list_for_each_entry_safe(entry, next, bitmap_list, list) {
ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
return -ENOSPC;
@@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode)
static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
- struct list_head *pos, *n;
+ struct btrfs_free_space *entry, *next;
- list_for_each_safe(pos, n, bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
+ list_for_each_entry_safe(entry, next, bitmap_list, list)
list_del_init(&entry->list);
- }
}
static void noinline_for_stack
@@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
goto out;
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state);
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1656,18 +1650,17 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
* at or below 32k, so we need to adjust how much memory we allow to be
* used by extent based free space tracking
*/
- if (size < 1024 * 1024 * 1024)
+ if (size < SZ_1G)
max_bytes = MAX_CACHE_BYTES_PER_GIG;
else
- max_bytes = MAX_CACHE_BYTES_PER_GIG *
- div_u64(size, 1024 * 1024 * 1024);
+ max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
/*
* we want to account for 1 more bitmap than what we have so we can make
* sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
* we add more bitmaps.
*/
- bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+ bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE;
if (bitmap_bytes >= max_bytes) {
ctl->extents_thresh = 0;
@@ -2016,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_space_op = {
+static const struct btrfs_free_space_op free_space_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -2118,7 +2111,7 @@ new_bitmap:
}
/* allocate the bitmap */
- info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
@@ -2489,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
* track of free space, and if we pass 1/2 of that we want to
* start converting things over to using bitmaps
*/
- ctl->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
+ ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
}
/*
@@ -2972,7 +2964,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry;
+ struct btrfs_free_space *entry = NULL;
int ret = -ENOSPC;
u64 bitmap_offset = offset_to_bitmap(ctl, offset);
@@ -2983,8 +2975,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
* The bitmap that covers offset won't be in the list unless offset
* is just its start offset.
*/
- entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
- if (entry->offset != bitmap_offset) {
+ if (!list_empty(bitmaps))
+ entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+
+ if (!entry || entry->offset != bitmap_offset) {
entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
if (entry && list_empty(&entry->list))
list_add(&entry->list, bitmaps);
@@ -3586,7 +3580,7 @@ again:
}
if (!map) {
- map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ map = kzalloc(PAGE_SIZE, GFP_NOFS);
if (!map) {
kmem_cache_free(btrfs_free_space_cachep, info);
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index f251865eb6f3..33178c490ace 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -37,7 +37,7 @@ struct btrfs_free_space_ctl {
int total_bitmaps;
int unit;
u64 start;
- struct btrfs_free_space_op *op;
+ const struct btrfs_free_space_op *op;
void *private;
struct mutex cache_writeout_mutex;
struct list_head trimming_ranges;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
new file mode 100644
index 000000000000..53dbeaf6ce94
--- /dev/null
+++ b/fs/btrfs/free-space-tree.c
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "free-space-tree.h"
+#include "transaction.h"
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+
+/*
+ * Compute the extent-count thresholds at which a block group's free space is
+ * converted between extent items and bitmap items, and store them in
+ * cache->bitmap_high_thresh and cache->bitmap_low_thresh.
+ */
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+{
+	u64 nr_bitmaps, bytes_as_bitmaps;
+	size_t bytes_per_bitmap;
+	u32 range_per_bitmap;
+
+	/* Number of bytes of the block group described by one bitmap item. */
+	range_per_bitmap = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+
+	/* Bitmap items needed to cover the whole block group, rounded up. */
+	nr_bitmaps = div_u64(cache->key.offset + range_per_bitmap - 1,
+			     range_per_bitmap);
+
+	bytes_per_bitmap = sizeof(struct btrfs_item) +
+			   BTRFS_FREE_SPACE_BITMAP_SIZE;
+	bytes_as_bitmaps = nr_bitmaps * bytes_per_bitmap;
+
+	/*
+	 * Switch to bitmaps once the extent items would take more disk space
+	 * than the bitmaps covering the block group.
+	 */
+	cache->bitmap_high_thresh = div_u64(bytes_as_bitmaps,
+					    sizeof(struct btrfs_item));
+
+	/*
+	 * Keep a gap of 100 extents between the two thresholds so that a
+	 * block group hovering around the limit does not flip formats back
+	 * and forth.
+	 */
+	cache->bitmap_low_thresh = cache->bitmap_high_thresh > 100 ?
+				   cache->bitmap_high_thresh - 100 : 0;
+}
+
+/*
+ * Insert a new FREE_SPACE_INFO item for @block_group into the free space
+ * tree, with an extent count of zero and no flags set.
+ *
+ * @path is always released before returning.  Returns 0 on success or the
+ * value returned by btrfs_insert_empty_item() on failure.
+ */
+static int add_new_free_space_info(struct btrfs_trans_handle *trans,
+				   struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_group_cache *block_group,
+				   struct btrfs_path *path)
+{
+	struct btrfs_root *fst_root = fs_info->free_space_root;
+	struct btrfs_free_space_info *item;
+	struct extent_buffer *eb;
+	struct btrfs_key info_key;
+	int ret;
+
+	info_key.objectid = block_group->key.objectid;
+	info_key.type = BTRFS_FREE_SPACE_INFO_KEY;
+	info_key.offset = block_group->key.offset;
+
+	ret = btrfs_insert_empty_item(trans, fst_root, path, &info_key,
+				      sizeof(*item));
+	if (!ret) {
+		/* Fill in the freshly inserted, still empty item. */
+		eb = path->nodes[0];
+		item = btrfs_item_ptr(eb, path->slots[0],
+				      struct btrfs_free_space_info);
+		btrfs_set_free_space_extent_count(eb, item, 0);
+		btrfs_set_free_space_flags(eb, item, 0);
+		btrfs_mark_buffer_dirty(eb);
+	}
+
+	btrfs_release_path(path);
+	return ret;
+}
+
+/*
+ * Look up the FREE_SPACE_INFO item of @block_group in the free space tree.
+ *
+ * @trans and @cow are passed straight through to btrfs_search_slot(); the
+ * search is done with an ins_len of 0.
+ *
+ * On success @path is left pointing at the item and the returned pointer
+ * refers to it inside path->nodes[0]; this function does not release @path.
+ * Returns ERR_PTR() of the btrfs_search_slot() error if the search fails, or
+ * ERR_PTR(-ENOENT) if the item does not exist.
+ */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info,
+		       struct btrfs_block_group_cache *block_group,
+		       struct btrfs_path *path, int cow)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = block_group->key.objectid;
+	key.type = BTRFS_FREE_SPACE_INFO_KEY;
+	key.offset = block_group->key.offset;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret != 0) {
+		/*
+		 * No trailing newline: the btrfs message helpers terminate
+		 * the line themselves, as the btrfs_err() calls in this file
+		 * already assume.
+		 */
+		btrfs_warn(fs_info, "missing free space info for %llu",
+			   block_group->key.objectid);
+		ASSERT(0);
+		return ERR_PTR(-ENOENT);
+	}
+
+	return btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_free_space_info);
+}
+
+/*
+ * Variant of btrfs_search_slot() that leaves @p on the item just before the
+ * position where @key would be, i.e. the greatest key less than @key.
+ *
+ * Callers are expected to pass a key that does not exist in the tree and
+ * that is preceded by at least one item in the leaf; an exact match, or a
+ * result in slot 0, is treated as an error (-EIO).  Negative errors from
+ * btrfs_search_slot() are returned unchanged.
+ */
+static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_key *key, struct btrfs_path *p,
+				  int ins_len, int cow)
+{
+	int ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);
+
+	if (ret < 0)
+		return ret;
+
+	/* Either the key unexpectedly exists, or nothing precedes it. */
+	if (ret == 0 || p->slots[0] == 0) {
+		ASSERT(0);
+		return -EIO;
+	}
+
+	p->slots[0]--;
+	return 0;
+}
+
+/*
+ * Number of bytes needed for a bitmap holding one bit per @sectorsize bytes
+ * of a range that is @size bytes long.
+ */
+static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+{
+	u32 nr_bits = (u32)div_u64(size, sectorsize);
+
+	return DIV_ROUND_UP(nr_bits, BITS_PER_BYTE);
+}
+
+/*
+ * Allocate a zeroed buffer of @bitmap_size bytes for an in-memory bitmap.
+ *
+ * Returns NULL on allocation failure.  The buffer may come from either the
+ * slab allocator or vmalloc, which is why callers release it with kvfree().
+ */
+static unsigned long *alloc_bitmap(u32 bitmap_size)
+{
+	const bool fits_in_page = bitmap_size <= PAGE_SIZE;
+	void *mem;
+
+	/*
+	 * The allocation size varies, observed numbers were < 4K up to 16K.
+	 * vmalloc for everything would be too heavy, so a contiguous
+	 * allocation is always attempted first.  For sizes above a page that
+	 * attempt is allowed to fail quietly because there is a fallback.
+	 */
+	mem = kzalloc(bitmap_size,
+		      fits_in_page ? GFP_NOFS : (GFP_NOFS | __GFP_NOWARN));
+	if (mem || fits_in_page)
+		return mem;
+
+	return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
+			 PAGE_KERNEL);
+}
+
+/*
+ * Convert the free space tree representation of @block_group from extent
+ * items to bitmap items.
+ *
+ * The FREE_SPACE_EXTENT items of the block group are deleted while being
+ * accumulated into an in-memory bitmap (one bit per sector).  Then the
+ * BTRFS_FREE_SPACE_USING_BITMAPS flag is set in the FREE_SPACE_INFO item and
+ * the bitmap is written back as FREE_SPACE_BITMAP items, each covering at most
+ * sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS bytes of the block group.
+ *
+ * Returns 0 on success.  On failure the transaction is aborted and the error
+ * is returned: -ENOMEM if the bitmap cannot be allocated, -EIO if the tree
+ * contents are inconsistent (unexpected key type or extent count), or the
+ * error of the failing tree operation.
+ */
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	unsigned long *bitmap;
+	char *bitmap_cursor;
+	u64 start, end;
+	u64 bitmap_range, i;
+	u32 bitmap_size, flags, expected_extent_count;
+	u32 extent_count = 0;
+	int done = 0, nr;
+	int ret;
+
+	bitmap_size = free_space_bitmap_size(block_group->key.offset,
+					     block_group->sectorsize);
+	bitmap = alloc_bitmap(bitmap_size);
+	if (!bitmap) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	start = block_group->key.objectid;
+	end = block_group->key.objectid + block_group->key.offset;
+
+	/* Start from the last possible key of the block group, walk back. */
+	key.objectid = end - 1;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	while (!done) {
+		ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+		if (ret)
+			goto out;
+
+		leaf = path->nodes[0];
+		nr = 0;
+		/*
+		 * From here on slots[0] is one past the item being examined,
+		 * so that after the loop [slots[0], slots[0] + nr) is exactly
+		 * the run of extent items to delete.
+		 */
+		path->slots[0]++;
+		while (path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+				ASSERT(found_key.objectid == block_group->key.objectid);
+				ASSERT(found_key.offset == block_group->key.offset);
+				done = 1;
+				break;
+			} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+				u64 first, last;
+
+				ASSERT(found_key.objectid >= start);
+				ASSERT(found_key.objectid < end);
+				ASSERT(found_key.objectid + found_key.offset <= end);
+
+				first = div_u64(found_key.objectid - start,
+						block_group->sectorsize);
+				last = div_u64(found_key.objectid + found_key.offset - start,
+					       block_group->sectorsize);
+				/*
+				 * NOTE(review): bitmap_set() works on native
+				 * unsigned longs while the buffer is copied to
+				 * the leaf byte by byte below; confirm the
+				 * resulting bit order on big-endian hosts.
+				 */
+				bitmap_set(bitmap, first, last - first);
+
+				extent_count++;
+				nr++;
+				path->slots[0]--;
+			} else {
+				/*
+				 * Unexpected item type.  This branch does not
+				 * advance the slot, so when ASSERT() is
+				 * compiled out the loop would never terminate;
+				 * fail the conversion instead.
+				 */
+				ASSERT(0);
+				ret = -EIO;
+				goto out;
+			}
+		}
+
+		ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+
+	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	if (IS_ERR(info)) {
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	leaf = path->nodes[0];
+	flags = btrfs_free_space_flags(leaf, info);
+	flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+	btrfs_set_free_space_flags(leaf, info, flags);
+	expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	if (extent_count != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, extent_count,
+			  expected_extent_count);
+		ASSERT(0);
+		ret = -EIO;
+		goto out;
+	}
+
+	/* Write the in-memory bitmap back, one bitmap item per bitmap_range. */
+	bitmap_cursor = (char *)bitmap;
+	bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+	i = start;
+	while (i < end) {
+		unsigned long ptr;
+		u64 extent_size;
+		u32 data_size;
+
+		/* The last item may cover less than a full bitmap_range. */
+		extent_size = min(end - i, bitmap_range);
+		data_size = free_space_bitmap_size(extent_size,
+						   block_group->sectorsize);
+
+		key.objectid = i;
+		key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
+		key.offset = extent_size;
+
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      data_size);
+		if (ret)
+			goto out;
+
+		leaf = path->nodes[0];
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		write_extent_buffer(leaf, bitmap_cursor, ptr,
+				    data_size);
+		btrfs_mark_buffer_dirty(leaf);
+		btrfs_release_path(path);
+
+		i += extent_size;
+		bitmap_cursor += data_size;
+	}
+
+	ret = 0;
+out:
+	kvfree(bitmap);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	return ret;
+}
+
+/*
+ * Convert the free space tree representation of @block_group from bitmap
+ * items to extent items.
+ *
+ * The FREE_SPACE_BITMAP items of the block group are deleted while being
+ * copied into one in-memory bitmap (one bit per sector).  Then the
+ * BTRFS_FREE_SPACE_USING_BITMAPS flag is cleared in the FREE_SPACE_INFO item
+ * and one FREE_SPACE_EXTENT item is inserted per run of set bits.
+ *
+ * Returns 0 on success.  On failure the transaction is aborted and the error
+ * is returned: -ENOMEM if the bitmap cannot be allocated, -EIO if the tree
+ * contents are inconsistent (unexpected key type or extent count), or the
+ * error of the failing tree operation.
+ */
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	unsigned long *bitmap;
+	u64 start, end;
+	/* Initialize to silence GCC. */
+	u64 extent_start = 0;
+	u64 offset;
+	u32 bitmap_size, flags, expected_extent_count;
+	int prev_bit = 0, bit, bitnr;
+	u32 extent_count = 0;
+	int done = 0, nr;
+	int ret;
+
+	bitmap_size = free_space_bitmap_size(block_group->key.offset,
+					     block_group->sectorsize);
+	bitmap = alloc_bitmap(bitmap_size);
+	if (!bitmap) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	start = block_group->key.objectid;
+	end = block_group->key.objectid + block_group->key.offset;
+
+	/* Start from the last possible key of the block group, walk back. */
+	key.objectid = end - 1;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	while (!done) {
+		ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+		if (ret)
+			goto out;
+
+		leaf = path->nodes[0];
+		nr = 0;
+		/*
+		 * From here on slots[0] is one past the item being examined,
+		 * so that after the loop [slots[0], slots[0] + nr) is exactly
+		 * the run of bitmap items to delete.
+		 */
+		path->slots[0]++;
+		while (path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+				ASSERT(found_key.objectid == block_group->key.objectid);
+				ASSERT(found_key.offset == block_group->key.offset);
+				done = 1;
+				break;
+			} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+				unsigned long ptr;
+				char *bitmap_cursor;
+				u32 bitmap_pos, data_size;
+
+				ASSERT(found_key.objectid >= start);
+				ASSERT(found_key.objectid < end);
+				ASSERT(found_key.objectid + found_key.offset <= end);
+
+				/* Byte offset of this item in the big bitmap. */
+				bitmap_pos = div_u64(found_key.objectid - start,
+						     block_group->sectorsize *
+						     BITS_PER_BYTE);
+				bitmap_cursor = ((char *)bitmap) + bitmap_pos;
+				data_size = free_space_bitmap_size(found_key.offset,
+								   block_group->sectorsize);
+
+				ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+				read_extent_buffer(leaf, bitmap_cursor, ptr,
+						   data_size);
+
+				nr++;
+				path->slots[0]--;
+			} else {
+				/*
+				 * Unexpected item type.  This branch does not
+				 * advance the slot, so when ASSERT() is
+				 * compiled out the loop would never terminate;
+				 * fail the conversion instead.
+				 */
+				ASSERT(0);
+				ret = -EIO;
+				goto out;
+			}
+		}
+
+		ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+
+	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	if (IS_ERR(info)) {
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	leaf = path->nodes[0];
+	flags = btrfs_free_space_flags(leaf, info);
+	flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+	btrfs_set_free_space_flags(leaf, info, flags);
+	expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	/*
+	 * Walk the bitmap one sector at a time; a 0 -> 1 transition opens an
+	 * extent and a 1 -> 0 transition closes it and inserts its item.
+	 *
+	 * NOTE(review): test_bit() reads native unsigned longs while the
+	 * buffer was filled byte by byte from the leaf; confirm the bit order
+	 * on big-endian hosts.
+	 */
+	offset = start;
+	bitnr = 0;
+	while (offset < end) {
+		bit = !!test_bit(bitnr, bitmap);
+		if (prev_bit == 0 && bit == 1) {
+			extent_start = offset;
+		} else if (prev_bit == 1 && bit == 0) {
+			key.objectid = extent_start;
+			key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+			key.offset = offset - extent_start;
+
+			ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+			if (ret)
+				goto out;
+			btrfs_release_path(path);
+
+			extent_count++;
+		}
+		prev_bit = bit;
+		offset += block_group->sectorsize;
+		bitnr++;
+	}
+	/* An extent still open at the end runs up to the block group end. */
+	if (prev_bit == 1) {
+		key.objectid = extent_start;
+		key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+		key.offset = end - extent_start;
+
+		ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+
+		extent_count++;
+	}
+
+	if (extent_count != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, extent_count,
+			  expected_extent_count);
+		ASSERT(0);
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = 0;
+out:
+	kvfree(bitmap);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	return ret;
+}
+
+/*
+ * Add @new_extents (which may be negative) to the extent count stored in the
+ * FREE_SPACE_INFO item of @block_group, and convert the block group between
+ * the extent and bitmap formats if the new count crosses a threshold.
+ *
+ * Returns 0 on success (including when @new_extents is 0 and nothing is
+ * done), or the error from the lookup or from the format conversion.
+ */
+static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
+					  struct btrfs_fs_info *fs_info,
+					  struct btrfs_block_group_cache *block_group,
+					  struct btrfs_path *path,
+					  int new_extents)
+{
+	struct btrfs_free_space_info *info;
+	struct extent_buffer *eb;
+	bool using_bitmaps;
+	u32 flags;
+	u32 extent_count;
+
+	if (new_extents == 0)
+		return 0;
+
+	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+
+	eb = path->nodes[0];
+	flags = btrfs_free_space_flags(eb, info);
+	extent_count = btrfs_free_space_extent_count(eb, info);
+	extent_count += new_extents;
+
+	btrfs_set_free_space_extent_count(eb, info, extent_count);
+	btrfs_mark_buffer_dirty(eb);
+	btrfs_release_path(path);
+
+	using_bitmaps = flags & BTRFS_FREE_SPACE_USING_BITMAPS;
+
+	/* Too many extents: bitmaps are now the cheaper representation. */
+	if (!using_bitmaps && extent_count > block_group->bitmap_high_thresh)
+		return convert_free_space_to_bitmaps(trans, fs_info,
+						     block_group, path);
+
+	/* Few enough extents again: go back to extent items. */
+	if (using_bitmaps && extent_count < block_group->bitmap_low_thresh)
+		return convert_free_space_to_extents(trans, fs_info,
+						     block_group, path);
+
+	return 0;
+}
+
+/*
+ * Test the bit describing the sector at byte @offset in the
+ * FREE_SPACE_BITMAP item that @path currently points to.
+ *
+ * Returns 1 if the bit is set and 0 if it is clear.
+ */
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+			struct btrfs_path *path, u64 offset)
+{
+	struct extent_buffer *eb = path->nodes[0];
+	const int slot = path->slots[0];
+	struct btrfs_key key;
+	u64 found_start, found_end;
+	unsigned long item_off, bit_nr;
+
+	btrfs_item_key_to_cpu(eb, &key, slot);
+	ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+	/* The bitmap item covers [key.objectid, key.objectid + key.offset). */
+	found_start = key.objectid;
+	found_end = key.objectid + key.offset;
+	ASSERT(offset >= found_start && offset < found_end);
+
+	item_off = btrfs_item_ptr_offset(eb, slot);
+	bit_nr = div_u64(offset - found_start, block_group->sectorsize);
+
+	return !!extent_buffer_test_bit(eb, item_off, bit_nr);
+}
+
+/*
+ * Set (@bit != 0) or clear (@bit == 0) the bits for as much of the range
+ * [*start, *start + *size) as lies inside the FREE_SPACE_BITMAP item that
+ * @path currently points to.
+ *
+ * On return *start and *size describe the part of the range that is still to
+ * be processed; *size is 0 once the whole range has been handled.
+ */
+static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+				struct btrfs_path *path, u64 *start, u64 *size,
+				int bit)
+{
+	struct extent_buffer *eb = path->nodes[0];
+	const int slot = path->slots[0];
+	struct btrfs_key key;
+	u64 found_start, found_end;
+	u64 end = *start + *size;
+	unsigned long item_off, first_bit, nr_bits;
+
+	btrfs_item_key_to_cpu(eb, &key, slot);
+	ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+	found_start = key.objectid;
+	found_end = key.objectid + key.offset;
+	ASSERT(*start >= found_start && *start < found_end);
+	ASSERT(end > found_start);
+
+	/* Never touch bits past the end of this bitmap item. */
+	end = end > found_end ? found_end : end;
+
+	item_off = btrfs_item_ptr_offset(eb, slot);
+	first_bit = div_u64(*start - found_start, block_group->sectorsize);
+	nr_bits = div_u64(end - found_start, block_group->sectorsize) -
+		  first_bit;
+
+	if (bit)
+		extent_buffer_bitmap_set(eb, item_off, first_bit, nr_bits);
+	else
+		extent_buffer_bitmap_clear(eb, item_off, first_bit, nr_bits);
+	btrfs_mark_buffer_dirty(eb);
+
+	/* Report back what is left for the following bitmap item(s). */
+	*size -= end - *start;
+	*start = end;
+}
+
+/*
+ * Advance @p from the current bitmap item to the next one.
+ *
+ * btrfs_next_item() cannot be used by modify_free_space_bitmap() because
+ * btrfs_next_leaf() does not get the path for writing.  The fancy tree
+ * walking of btrfs_next_leaf() is not needed anyway, since the key range of
+ * the next bitmap is known: it starts where the current one ends.
+ *
+ * Returns 0 on success or the error from btrfs_search_prev_slot().
+ */
+static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root, struct btrfs_path *p)
+{
+	struct btrfs_key next_key;
+
+	if (p->slots[0] + 1 >= btrfs_header_nritems(p->nodes[0])) {
+		/*
+		 * Last item of the leaf: search again (with COW) for the item
+		 * that covers the first byte after the current bitmap.
+		 */
+		btrfs_item_key_to_cpu(p->nodes[0], &next_key, p->slots[0]);
+		btrfs_release_path(p);
+
+		next_key.objectid += next_key.offset;
+		next_key.type = (u8)-1;
+		next_key.offset = (u64)-1;
+
+		return btrfs_search_prev_slot(trans, root, &next_key, p, 0, 1);
+	}
+
+	/* The next item lives in the same leaf. */
+	p->slots[0]++;
+	return 0;
+}
+
+/*
+ * Set or clear the bits for the range [start, start + size) in the free space
+ * bitmap items of a block group, then adjust the block group's free space
+ * extent count to match.
+ *
+ * If remove is 1, then we are removing free space, thus clearing bits in the
+ * bitmap. If remove is 0, then we are adding free space, thus setting bits in
+ * the bitmap.
+ *
+ * The bits of the blocks immediately before and after the range (when those
+ * blocks lie inside the block group) decide how the number of free extents
+ * changes: a removal can leave leftovers on either side, an addition can
+ * merge with a neighbor on either side.
+ *
+ * Returns 0 on success, or the error from the failing tree search or from
+ * update_free_space_extent_count().
+ */
+static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size, int remove)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 end = start + size;
+ u64 cur_start, cur_size;
+ int prev_bit, next_bit;
+ int new_extents;
+ int ret;
+
+ /*
+ * Read the bit for the block immediately before the extent of space if
+ * that block is within the block group.
+ */
+ if (start > block_group->key.objectid) {
+ u64 prev_block = start - block_group->sectorsize;
+
+ key.objectid = prev_block;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = free_space_test_bit(block_group, path, prev_block);
+
+ /* The previous block may have been in the previous bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (start >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+ } else {
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+ if (ret)
+ goto out;
+
+ prev_bit = -1; /* no block before the range inside this block group */
+ }
+
+ /*
+ * Iterate over all of the bitmaps overlapped by the extent of space,
+ * clearing/setting bits as required.
+ */
+ cur_start = start;
+ cur_size = size;
+ while (1) {
+ free_space_set_bits(block_group, path, &cur_start, &cur_size,
+ !remove);
+ if (cur_size == 0)
+ break;
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ /*
+ * Read the bit for the block immediately after the extent of space if
+ * that block is within the block group.
+ */
+ if (end < block_group->key.objectid + block_group->key.offset) {
+ /* The next block may be in the next bitmap. */
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (end >= key.objectid + key.offset) {
+ ret = free_space_next_bitmap(trans, root, path);
+ if (ret)
+ goto out;
+ }
+
+ next_bit = free_space_test_bit(block_group, path, end);
+ } else {
+ next_bit = -1; /* no block after the range inside this block group */
+ }
+
+ if (remove) {
+ new_extents = -1; /* the extent that contained the range goes away */
+ if (prev_bit == 1) {
+ /* Leftover on the left. */
+ new_extents++;
+ }
+ if (next_bit == 1) {
+ /* Leftover on the right. */
+ new_extents++;
+ }
+ } else {
+ new_extents = 1; /* the range itself is a new extent unless merged */
+ if (prev_bit == 1) {
+ /* Merging with neighbor on the left. */
+ new_extents--;
+ }
+ if (next_bit == 1) {
+ /* Merging with neighbor on the right. */
+ new_extents--;
+ }
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+/*
+ * Remove the range [start, start + size) from the free space extent item
+ * that contains it, re-insert items for any leftover space on either side,
+ * and update the block group's free space extent count accordingly.
+ *
+ * Returns 0 on success, or the error from the failing tree operation or from
+ * update_free_space_extent_count().
+ */
+static int remove_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = -1;
+ int ret;
+
+ key.objectid = start;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(start >= found_start && end <= found_end);
+
+ /*
+ * Okay, now that we've found the free space extent which contains the
+ * free space that we are removing, there are four cases:
+ *
+ * 1. We're using the whole extent: delete the key we found and
+ * decrement the free space extent count.
+ * 2. We are using part of the extent starting at the beginning: delete
+ * the key we found and insert a new key representing the leftover at
+ * the end. There is no net change in the number of extents.
+ * 3. We are using part of the extent ending at the end: delete the key
+ * we found and insert a new key representing the leftover at the
+ * beginning. There is no net change in the number of extents.
+ * 4. We are using part of the extent in the middle: delete the key we
+ * found and insert two new keys representing the leftovers on each
+ * side. Where we used to have one extent, we now have two, so increment
+ * the extent count. We may need to convert the block group to bitmaps
+ * as a result.
+ */
+
+ /* Delete the existing key (cases 1-4). */
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+
+ /* Add a key for leftovers at the beginning (cases 3 and 4). */
+ if (start > found_start) {
+ key.objectid = found_start;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = start - found_start;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ /* Add a key for leftovers at the end (cases 2 and 4). */
+ if (end < found_end) {
+ key.objectid = end;
+ key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ key.offset = found_end - end;
+
+ btrfs_release_path(path);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+ if (ret)
+ goto out;
+ new_extents++;
+ }
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+/*
+ * Remove the range [start, start + size) from the free space recorded for
+ * @block_group, using whichever format (extents or bitmaps) the block group
+ * currently has.
+ *
+ * If the block group is still flagged as needing its free space items, they
+ * are added first.  Returns 0 on success or the error from the failing step.
+ */
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path, u64 start, u64 size)
+{
+	struct btrfs_free_space_info *info;
+	u32 info_flags;
+	int err;
+
+	if (block_group->needs_free_space) {
+		err = __add_block_group_free_space(trans, fs_info, block_group,
+						   path);
+		if (err)
+			return err;
+	}
+
+	/* Read-only lookup: only the format flag is needed here. */
+	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+	info_flags = btrfs_free_space_flags(path->nodes[0], info);
+	btrfs_release_path(path);
+
+	if (!(info_flags & BTRFS_FREE_SPACE_USING_BITMAPS))
+		return remove_free_space_extent(trans, fs_info, block_group,
+						path, start, size);
+
+	return modify_free_space_bitmap(trans, fs_info, block_group, path,
+					start, size, 1);
+}
+
+/*
+ * Remove the range [start, start + size) from the free space tree.
+ *
+ * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro feature is
+ * not set.  Otherwise the block group containing @start is looked up and
+ * updated under its free_space_lock.  On failure the transaction is aborted
+ * and the error is returned.
+ */
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info,
+				u64 start, u64 size)
+{
+	struct btrfs_block_group_cache *bg;
+	struct btrfs_path *path;
+	int ret = -ENOMEM;
+
+	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto finish;
+
+	bg = btrfs_lookup_block_group(fs_info, start);
+	if (!bg) {
+		ASSERT(0);
+		ret = -ENOENT;
+		goto finish;
+	}
+
+	mutex_lock(&bg->free_space_lock);
+	ret = __remove_from_free_space_tree(trans, fs_info, bg, path, start,
+					    size);
+	mutex_unlock(&bg->free_space_lock);
+
+	/* Drop the reference taken by btrfs_lookup_block_group(). */
+	btrfs_put_block_group(bg);
+
+finish:
+	btrfs_free_path(path);
+	if (ret)
+		btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+	return ret;
+}
+/*
+ * Add the range [start, start + size) to the free space extent items of a
+ * block group, merging it with a directly adjacent free extent on either
+ * side, and update the block group's free space extent count accordingly.
+ *
+ * Returns 0 on success, or the error from the failing tree operation or from
+ * update_free_space_extent_count().
+ */
+static int add_free_space_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path,
+ u64 start, u64 size)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_key key, new_key;
+ u64 found_start, found_end;
+ u64 end = start + size;
+ int new_extents = 1;
+ int ret;
+
+ /*
+ * We are adding a new extent of free space, but we need to merge
+ * extents. There are four cases here:
+ *
+ * 1. The new extent does not have any immediate neighbors to merge
+ * with: add the new key and increment the free space extent count. We
+ * may need to convert the block group to bitmaps as a result.
+ * 2. The new extent has an immediate neighbor before it: remove the
+ * previous key and insert a new key combining both of them. There is no
+ * net change in the number of extents.
+ * 3. The new extent has an immediate neighbor after it: remove the next
+ * key and insert a new key combining both of them. There is no net
+ * change in the number of extents.
+ * 4. The new extent has immediate neighbors on both sides: remove both
+ * of the keys and insert a new key combining all of them. Where we used
+ * to have two extents, we now have one, so decrement the extent count.
+ */
+
+ new_key.objectid = start;
+ new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+ new_key.offset = size;
+
+ /* Search for a neighbor on the left. */
+ if (start == block_group->key.objectid)
+ goto right;
+ key.objectid = start - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto right;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT(found_start < start && found_end <= start);
+
+ /*
+ * Delete the neighbor on the left and absorb it into the new key (cases
+ * 2 and 4).
+ */
+ if (found_end == start) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.objectid = found_start;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+right:
+ /* Search for a neighbor on the right. */
+ if (end == block_group->key.objectid + block_group->key.offset)
+ goto insert;
+ key.objectid = end;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+ ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+ btrfs_release_path(path);
+ goto insert;
+ }
+
+ found_start = key.objectid;
+ found_end = key.objectid + key.offset;
+ ASSERT(found_start >= block_group->key.objectid &&
+ found_end > block_group->key.objectid);
+ ASSERT((found_start < start && found_end <= start) ||
+ (found_start >= end && found_end > end));
+
+ /*
+ * Delete the neighbor on the right and absorb it into the new key
+ * (cases 3 and 4).
+ */
+ if (found_start == end) {
+ ret = btrfs_del_item(trans, root, path);
+ if (ret)
+ goto out;
+ new_key.offset += key.offset;
+ new_extents--;
+ }
+ btrfs_release_path(path);
+
+insert:
+ /* Insert the new key (cases 1-4). */
+ ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(path);
+ ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+ new_extents);
+
+out:
+ return ret;
+}
+
+/*
+ * Add the range [start, start + size) to the free space recorded for
+ * @block_group, using whichever format (extents or bitmaps) the block group
+ * currently has.
+ *
+ * If the block group is still flagged as needing its free space items, they
+ * are added first.  Returns 0 on success or the error from the failing step.
+ */
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_path *path, u64 start, u64 size)
+{
+	struct btrfs_free_space_info *info;
+	u32 info_flags;
+	int err;
+
+	if (block_group->needs_free_space) {
+		err = __add_block_group_free_space(trans, fs_info, block_group,
+						   path);
+		if (err)
+			return err;
+	}
+
+	/* Read-only lookup: only the format flag is needed here. */
+	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+	info_flags = btrfs_free_space_flags(path->nodes[0], info);
+	btrfs_release_path(path);
+
+	if (!(info_flags & BTRFS_FREE_SPACE_USING_BITMAPS))
+		return add_free_space_extent(trans, fs_info, block_group, path,
+					     start, size);
+
+	return modify_free_space_bitmap(trans, fs_info, block_group, path,
+					start, size, 0);
+}
+
+/*
+ * Add the range [start, start + size) to the free space tree.
+ *
+ * Does nothing and returns 0 when the FREE_SPACE_TREE compat_ro feature is
+ * not set.  Otherwise the block group containing @start is looked up and
+ * updated under its free_space_lock.  On failure the transaction is aborted
+ * and the error is returned.
+ */
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   u64 start, u64 size)
+{
+	struct btrfs_block_group_cache *bg;
+	struct btrfs_path *path;
+	int ret = -ENOMEM;
+
+	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto finish;
+
+	bg = btrfs_lookup_block_group(fs_info, start);
+	if (!bg) {
+		ASSERT(0);
+		ret = -ENOENT;
+		goto finish;
+	}
+
+	mutex_lock(&bg->free_space_lock);
+	ret = __add_to_free_space_tree(trans, fs_info, bg, path, start, size);
+	mutex_unlock(&bg->free_space_lock);
+
+	/* Drop the reference taken by btrfs_lookup_block_group(). */
+	btrfs_put_block_group(bg);
+
+finish:
+	btrfs_free_path(path);
+	if (ret)
+		btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+	return ret;
+}
+
+/*
+ * Populate the free space tree by walking the extent tree. Operations on the
+ * extent tree that happen as a result of writes to the free space tree will go
+ * through the normal add/remove hooks.
+ *
+ * A FREE_SPACE_INFO item is inserted for the block group first; then every
+ * gap between consecutive extent/metadata items inside the block group, and
+ * the gap after the last one, is added as free space. Two paths are used:
+ * "path" iterates the extent tree while "path2" is used for the free space
+ * tree updates.
+ *
+ * Returns 0 on success, -ENOMEM if a path cannot be allocated, or the error
+ * from the failing tree operation.
+ */
+static int populate_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ struct btrfs_path *path, *path2;
+ struct btrfs_key key;
+ u64 start, end;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->reada = 1;
+
+ path2 = btrfs_alloc_path();
+ if (!path2) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
+
+ ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+ if (ret)
+ goto out;
+
+ mutex_lock(&block_group->free_space_lock);
+
+ /*
+ * Iterate through all of the extent and metadata items in this block
+ * group, adding the free space between them and the free space at the
+ * end. Note that EXTENT_ITEM and METADATA_ITEM are less than
+ * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
+ * contained in.
+ */
+ key.objectid = block_group->key.objectid;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
+ if (ret < 0)
+ goto out_locked;
+ ASSERT(ret == 0);
+
+ start = block_group->key.objectid; /* end of the last allocated item seen so far */
+ end = block_group->key.objectid + block_group->key.offset;
+ while (1) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+ key.type == BTRFS_METADATA_ITEM_KEY) {
+ if (key.objectid >= end)
+ break;
+
+ if (start < key.objectid) {
+ ret = __add_to_free_space_tree(trans, fs_info,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
+ if (ret)
+ goto out_locked;
+ }
+ start = key.objectid;
+ if (key.type == BTRFS_METADATA_ITEM_KEY)
+ start += fs_info->tree_root->nodesize;
+ else
+ start += key.offset;
+ } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ if (key.objectid != block_group->key.objectid)
+ break;
+ }
+
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ goto out_locked;
+ if (ret)
+ break;
+ }
+ if (start < end) { /* free space after the last allocated item */
+ ret = __add_to_free_space_tree(trans, fs_info, block_group,
+ path2, start, end - start);
+ if (ret)
+ goto out_locked;
+ }
+
+ ret = 0;
+out_locked:
+ mutex_unlock(&block_group->free_space_lock);
+out:
+ btrfs_free_path(path2);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Create the free space tree and populate it for every block group in
+ * fs_info->block_group_cache_tree, all within one transaction, then set the
+ * FREE_SPACE_TREE compat_ro feature and commit.
+ *
+ * fs_info->creating_free_space_tree is set for the duration of the creation.
+ * Returns 0 on success.  If creating or populating the tree fails, the
+ * transaction is aborted and the error is returned; otherwise the result of
+ * btrfs_commit_transaction() is returned.
+ */
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_block_group_cache *bg;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *fst_root;
+	struct rb_node *node;
+	int ret;
+
+	trans = btrfs_start_transaction(tree_root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	fs_info->creating_free_space_tree = 1;
+
+	fst_root = btrfs_create_tree(trans, fs_info,
+				     BTRFS_FREE_SPACE_TREE_OBJECTID);
+	if (IS_ERR(fst_root)) {
+		ret = PTR_ERR(fst_root);
+		goto abort;
+	}
+	fs_info->free_space_root = fst_root;
+
+	/* Fill in the free space of every known block group. */
+	for (node = rb_first(&fs_info->block_group_cache_tree); node;
+	     node = rb_next(node)) {
+		bg = rb_entry(node, struct btrfs_block_group_cache, cache_node);
+		ret = populate_free_space_tree(trans, fs_info, bg);
+		if (ret)
+			goto abort;
+	}
+
+	btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+	fs_info->creating_free_space_tree = 0;
+
+	return btrfs_commit_transaction(trans, tree_root);
+
+abort:
+	fs_info->creating_free_space_tree = 0;
+	btrfs_abort_transaction(trans, tree_root, ret);
+	btrfs_end_transaction(trans, tree_root);
+	return ret;
+}
+
+/*
+ * Delete every item from the tree rooted at @root.
+ *
+ * Each pass searches for the smallest possible key (0, 0, 0) with deletion
+ * intent, removes all items of the leaf the search lands on in one
+ * btrfs_del_items() call, releases the path and starts over.  The loop ends
+ * when the leaf found holds no items.
+ *
+ * Returns 0 once the tree is empty, -ENOMEM if no path could be allocated,
+ * or the error reported by the search or the deletion.
+ */
+static int clear_free_space_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root)
+{
+	struct btrfs_key first_key = {
+		.objectid = 0,
+		.type = 0,
+		.offset = 0,
+	};
+	struct btrfs_path *path;
+	int nritems;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+
+	for (;;) {
+		ret = btrfs_search_slot(trans, root, &first_key, path, -1, 1);
+		if (ret < 0)
+			break;
+
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		if (!nritems) {
+			/* Nothing left to delete: report success. */
+			ret = 0;
+			break;
+		}
+
+		/* Drop the whole leaf's worth of items starting at slot 0. */
+		path->slots[0] = 0;
+		ret = btrfs_del_items(trans, root, path, 0, nritems);
+		if (ret)
+			break;
+
+		btrfs_release_path(path);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Remove the free space tree from the filesystem.
+ *
+ * Within one transaction: clear the FREE_SPACE_TREE compat_ro flag and
+ * detach fs_info->free_space_root, delete every item in the tree, delete its
+ * root item from the tree root, free the root's tree block, release the
+ * in-memory root and commit.
+ *
+ * Returns 0 on success or an error code.  If emptying the tree or deleting
+ * the root item fails, the transaction is aborted and ended before the error
+ * is returned.
+ */
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *fst_root = fs_info->free_space_root;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	trans = btrfs_start_transaction(tree_root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	/* Flag and pointer are dropped before the tree itself is torn down. */
+	btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+	fs_info->free_space_root = NULL;
+
+	ret = clear_free_space_tree(trans, fst_root);
+	if (ret)
+		goto abort;
+
+	ret = btrfs_del_root(trans, tree_root, &fst_root->root_key);
+	if (ret)
+		goto abort;
+
+	list_del(&fst_root->dirty_list);
+
+	/* Clean the root node under its tree lock, then give the block back. */
+	btrfs_tree_lock(fst_root->node);
+	clean_tree_block(trans, tree_root->fs_info, fst_root->node);
+	btrfs_tree_unlock(fst_root->node);
+	btrfs_free_tree_block(trans, fst_root, fst_root->node, 0, 1);
+
+	/* Release both extent buffer references, then the root structure. */
+	free_extent_buffer(fst_root->node);
+	free_extent_buffer(fst_root->commit_root);
+	kfree(fst_root);
+
+	return btrfs_commit_transaction(trans, tree_root);
+
+abort:
+	btrfs_abort_transaction(trans, tree_root, ret);
+	btrfs_end_transaction(trans, tree_root);
+	return ret;
+}
+
+/*
+ * Add a block group to the free space tree.
+ *
+ * Clears block_group->needs_free_space, adds the block group's free space
+ * info item with add_new_free_space_info(), then adds the range covering the
+ * whole block group (key.objectid, length key.offset) to the free space
+ * tree.  Note that needs_free_space is cleared before either step can fail.
+ *
+ * The caller visible in this file, add_block_group_free_space(), holds
+ * block_group->free_space_lock around this call.
+ *
+ * Returns 0 on success or the error from either helper.
+ */
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+					struct btrfs_fs_info *fs_info,
+					struct btrfs_block_group_cache *block_group,
+					struct btrfs_path *path)
+{
+	int ret;
+
+	block_group->needs_free_space = 0;
+
+	ret = add_new_free_space_info(trans, fs_info, block_group, path);
+	if (ret)
+		return ret;
+
+	return __add_to_free_space_tree(trans, fs_info, block_group, path,
+					block_group->key.objectid,
+					block_group->key.offset);
+}
+
+/*
+ * Add @block_group to the free space tree if that has not happened yet.
+ *
+ * Returns 0 without touching anything when the FREE_SPACE_TREE compat_ro
+ * flag is not set.  Otherwise, under block_group->free_space_lock, the block
+ * group is added only if its needs_free_space flag is still set.
+ *
+ * Any failure, including failing to allocate the path, aborts the
+ * transaction before the error is returned.
+ */
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+			       struct btrfs_fs_info *fs_info,
+			       struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_path *path = NULL;
+	int ret = 0;
+
+	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		return 0;
+
+	mutex_lock(&block_group->free_space_lock);
+	if (block_group->needs_free_space) {
+		path = btrfs_alloc_path();
+		if (path)
+			ret = __add_block_group_free_space(trans, fs_info,
+							   block_group, path);
+		else
+			ret = -ENOMEM;
+	}
+	/* path is still NULL here when nothing had to be done. */
+	btrfs_free_path(path);
+	mutex_unlock(&block_group->free_space_lock);
+
+	if (ret)
+		btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+	return ret;
+}
+
+/*
+ * Delete all free space tree items that belong to @block_group.
+ *
+ * Returns 0 without doing anything when the FREE_SPACE_TREE compat_ro flag
+ * is not set, or when the block group still has needs_free_space set.
+ *
+ * Otherwise the items are removed back to front: each pass searches from
+ * the key (end - 1, (u8)-1, (u64)-1), walks the slots of the resulting leaf
+ * backwards while counting extent/bitmap items, and deletes the counted run
+ * with a single btrfs_del_items() call.  Passes repeat until the block
+ * group's free space info item has been counted and deleted.
+ *
+ * Any failure, including failing to allocate the path, aborts the
+ * transaction before the error is returned.
+ */
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_root *root = fs_info->free_space_root;
+ struct btrfs_path *path;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ u64 start, end;
+ int done = 0, nr;
+ int ret;
+
+ if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ return 0;
+
+ if (block_group->needs_free_space) {
+ /* We never added this block group to the free space tree. */
+ return 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ start = block_group->key.objectid;
+ end = block_group->key.objectid + block_group->key.offset;
+
+ /* Largest key whose objectid is still inside the block group. */
+ key.objectid = end - 1;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ while (!done) {
+ /*
+ * Presumably leaves the path on the last item at or before the
+ * search key -- confirm against btrfs_search_prev_slot().
+ */
+ ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ nr = 0;
+ /*
+ * Bump the slot so that "slots[0] - 1" below names the slot the
+ * search returned; the walk then moves backwards from there.
+ */
+ path->slots[0]++;
+ while (path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+ if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+ /* The info item is the last one to go; finish after this batch. */
+ ASSERT(found_key.objectid == block_group->key.objectid);
+ ASSERT(found_key.offset == block_group->key.offset);
+ done = 1;
+ nr++;
+ path->slots[0]--;
+ break;
+ } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
+ found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ /* Extent and bitmap items must lie entirely inside the block group. */
+ ASSERT(found_key.objectid >= start);
+ ASSERT(found_key.objectid < end);
+ ASSERT(found_key.objectid + found_key.offset <= end);
+ nr++;
+ path->slots[0]--;
+ } else {
+ /*
+ * NOTE(review): this branch does not change slots[0]; if
+ * ASSERT() can compile to nothing, the inner loop would not
+ * advance on an unexpected key type -- confirm.
+ */
+ ASSERT(0);
+ }
+ }
+
+ /* slots[0] now names the first of the nr items counted above. */
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+ if (ret)
+ goto out;
+ btrfs_release_path(path);
+ }
+
+ ret = 0;
+out:
+ /* Also reached with path == NULL when the allocation failed. */
+ btrfs_free_path(path);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+}
+
+/*
+ * Load a block group's free space from bitmap items.
+ *
+ * @path must already point at the block group's free space info item; the
+ * items after it are walked with btrfs_next_item() until the next info item
+ * or the end of the tree.  Each bitmap is scanned one sectorsize step at a
+ * time: a 0 -> 1 transition opens a free run and a 1 -> 0 transition closes
+ * it and hands it to add_new_free_space().  A run may continue across
+ * consecutive bitmap items; one still open at the end of the walk is closed
+ * at the end of the block group.
+ *
+ * caching_ctl->progress follows the walk and is set to (u64)-1 on success.
+ * Waiters on caching_ctl->wait are woken each time more than
+ * CACHING_CTL_WAKE_UP bytes have been found since the previous wake-up.
+ *
+ * Returns 0 on success, a negative value from btrfs_next_item(), or -EIO
+ * when the number of runs differs from @expected_extent_count.
+ */
+static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
+				   struct btrfs_path *path,
+				   u32 expected_extent_count)
+{
+	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_root *root = fs_info->free_space_root;
+	u64 bg_end = block_group->key.objectid + block_group->key.offset;
+	struct btrfs_key key;
+	/* Only read after a 0 -> 1 transition stored it; zeroed to silence GCC. */
+	u64 run_start = 0;
+	u64 bytes_found = 0;
+	u32 nr_extents = 0;
+	int last_bit = 0;
+	int ret;
+
+	for (;;) {
+		u64 bitmap_end, cur;
+		int bit;
+
+		ret = btrfs_next_item(root, path);
+		if (ret < 0)
+			return ret;
+		if (ret > 0)
+			break;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+		/* The next info item belongs to the following block group. */
+		if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+			break;
+
+		ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+		ASSERT(key.objectid < bg_end && key.objectid + key.offset <= bg_end);
+
+		caching_ctl->progress = key.objectid;
+
+		bitmap_end = key.objectid + key.offset;
+		for (cur = key.objectid; cur < bitmap_end;
+		     cur += block_group->sectorsize) {
+			bit = free_space_test_bit(block_group, path, cur);
+			if (last_bit == 0 && bit == 1) {
+				run_start = cur;
+			} else if (last_bit == 1 && bit == 0) {
+				bytes_found += add_new_free_space(block_group,
+								  fs_info,
+								  run_start,
+								  cur);
+				if (bytes_found > CACHING_CTL_WAKE_UP) {
+					bytes_found = 0;
+					wake_up(&caching_ctl->wait);
+				}
+				nr_extents++;
+			}
+			last_bit = bit;
+		}
+	}
+
+	/* A run that was never closed extends to the end of the block group. */
+	if (last_bit == 1) {
+		bytes_found += add_new_free_space(block_group, fs_info,
+						  run_start, bg_end);
+		nr_extents++;
+	}
+
+	if (nr_extents != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, nr_extents,
+			  expected_extent_count);
+		ASSERT(0);
+		return -EIO;
+	}
+
+	caching_ctl->progress = (u64)-1;
+
+	return 0;
+}
+
+/*
+ * Load a block group's free space from extent items.
+ *
+ * @path must already point at the block group's free space info item; the
+ * items after it are walked with btrfs_next_item() until the next info item
+ * or the end of the tree.  Every extent item (objectid = start, offset =
+ * length) is handed to add_new_free_space().
+ *
+ * caching_ctl->progress follows the walk and is set to (u64)-1 on success.
+ * Waiters on caching_ctl->wait are woken each time more than
+ * CACHING_CTL_WAKE_UP bytes have been found since the previous wake-up.
+ *
+ * Returns 0 on success, a negative value from btrfs_next_item(), or -EIO
+ * when the number of extents differs from @expected_extent_count.
+ */
+static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
+				   struct btrfs_path *path,
+				   u32 expected_extent_count)
+{
+	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_root *root = fs_info->free_space_root;
+	u64 bg_end = block_group->key.objectid + block_group->key.offset;
+	struct btrfs_key key;
+	u64 bytes_found = 0;
+	u32 nr_extents = 0;
+	int ret;
+
+	for (;;) {
+		ret = btrfs_next_item(root, path);
+		if (ret < 0)
+			return ret;
+		if (ret > 0)
+			break;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+		/* The next info item belongs to the following block group. */
+		if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+			break;
+
+		ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+		ASSERT(key.objectid < bg_end && key.objectid + key.offset <= bg_end);
+
+		caching_ctl->progress = key.objectid;
+
+		bytes_found += add_new_free_space(block_group, fs_info,
+						  key.objectid,
+						  key.objectid + key.offset);
+		if (bytes_found > CACHING_CTL_WAKE_UP) {
+			bytes_found = 0;
+			wake_up(&caching_ctl->wait);
+		}
+		nr_extents++;
+	}
+
+	if (nr_extents != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, nr_extents,
+			  expected_extent_count);
+		ASSERT(0);
+		return -EIO;
+	}
+
+	caching_ctl->progress = (u64)-1;
+
+	return 0;
+}
+
+/*
+ * Load the free space of caching_ctl->block_group from the free space tree.
+ *
+ * Looks up the block group's free space info item, reads its extent count
+ * and flags, and then dispatches to the bitmap or the extent loader
+ * depending on BTRFS_FREE_SPACE_USING_BITMAPS.  The lookup is done against
+ * the commit root without taking tree locks.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, the error
+ * from the info item lookup, or the loader's error.
+ */
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_free_space_info *info;
+	struct btrfs_path *path;
+	u32 extent_count, flags;
+	int ret;
+
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Just like caching_thread() doesn't want to deadlock on the extent
+	 * tree, we don't want to deadlock on the free space tree.
+	 */
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+	/*
+	 * Both loaders only ever step forward with btrfs_next_item(), so ask
+	 * for forward readahead by name rather than with a bare number, like
+	 * the other path->reada users converted to READA_* constants.
+	 * NOTE(review): if READA_BACK is numerically 1, the previous literal
+	 * requested backward readahead here -- confirm against the enum.
+	 */
+	path->reada = READA_FORWARD;
+
+	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	if (IS_ERR(info)) {
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+
+	/*
+	 * We left path pointing to the free space info item, so now
+	 * load_free_space_foo can just iterate through the free space tree from
+	 * there.
+	 */
+	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
+		ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+	else
+		ret = load_free_space_extents(caching_ctl, path, extent_count);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
new file mode 100644
index 000000000000..54ffced3bce8
--- /dev/null
+++ b/fs/btrfs/free-space-tree.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_TREE
+#define __BTRFS_FREE_SPACE_TREE
+
+/*
+ * The default size for new free space bitmap items. The last bitmap in a block
+ * group may be truncated, and none of the free space tree code assumes that
+ * existing bitmaps are this size.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
+#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+/*
+ * Create the free space tree, populate it from every block group, set the
+ * FREE_SPACE_TREE compat_ro flag and commit.  Returns 0 or an error code.
+ */
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
+/*
+ * Clear the FREE_SPACE_TREE compat_ro flag, delete the tree's items and its
+ * root, and commit.  Returns 0 or an error code.
+ */
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
+/*
+ * Load the free space of caching_ctl->block_group from the free space tree,
+ * using either its bitmap or its extent items.  Returns 0 or an error code.
+ */
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+/*
+ * Add a block group to the free space tree if its needs_free_space flag is
+ * still set.  Returns 0 immediately when the FREE_SPACE_TREE compat_ro flag
+ * is not set; aborts the transaction on failure.
+ */
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+/*
+ * Delete all of a block group's items from the free space tree.  Returns 0
+ * immediately when the FREE_SPACE_TREE compat_ro flag is not set or the
+ * block group still has needs_free_space set; aborts the transaction on
+ * failure.
+ */
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group);
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ u64 start, u64 size);
+
+/* Exposed for testing. */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, int cow);
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path);
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path, u64 offset);
+
+#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 767a6056ac45..70107f7c9307 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -48,7 +48,7 @@ static int caching_kthread(void *data)
/* Since the commit root is read-only, we can safely skip locking. */
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.offset = 0;
@@ -282,8 +282,8 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
}
}
-#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
-#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space))
+#define INODES_PER_BITMAP (PAGE_SIZE * 8)
/*
* The goal is to keep the memory used by the free_ino tree won't
@@ -317,7 +317,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
}
ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
- PAGE_CACHE_SIZE / sizeof(*info);
+ PAGE_SIZE / sizeof(*info);
}
/*
@@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
return true;
}
-static struct btrfs_free_space_op free_ino_op = {
+static const struct btrfs_free_space_op free_ino_op = {
.recalc_thresholds = recalculate_thresholds,
.use_bitmap = use_bitmap,
};
@@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
return false;
}
-static struct btrfs_free_space_op pinned_free_ino_op = {
+static const struct btrfs_free_space_op pinned_free_ino_op = {
.recalc_thresholds = pinned_recalc_thresholds,
.use_bitmap = pinned_use_bitmap,
};
@@ -481,12 +481,12 @@ again:
spin_lock(&ctl->tree_lock);
prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
- prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
- prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+ prealloc = ALIGN(prealloc, PAGE_SIZE);
+ prealloc += ctl->total_bitmaps * PAGE_SIZE;
spin_unlock(&ctl->tree_lock);
/* Just to make sure we have enough space */
- prealloc += 8 * PAGE_CACHE_SIZE;
+ prealloc += 8 * PAGE_SIZE;
ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
@@ -515,7 +515,7 @@ out:
return ret;
}
-static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
struct btrfs_path *path;
int ret;
@@ -555,14 +555,10 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
int ret;
mutex_lock(&root->objectid_mutex);
- if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_find_highest_objectid(root,
- &root->highest_objectid);
- if (ret)
- goto out;
- }
-
if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ btrfs_warn(root->fs_info,
+ "the objectid of root %llu reaches its highest value",
+ root->root_key.objectid);
ret = -ENOSPC;
goto out;
}
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
index ddb347bfee23..c8e864b2d530 100644
--- a/fs/btrfs/inode-map.h
+++ b/fs/btrfs/inode-map.h
@@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans);
int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid);
#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4439fbb4ff45..2aaba58b4856 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
+struct btrfs_dio_data {
+ u64 outstanding_extents;
+ u64 reserve;
+ u64 unsubmitted_oe_range_start;
+ u64 unsubmitted_oe_range_end;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
-static struct extent_io_ops btrfs_extent_io_ops;
+static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
-static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
-static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
@@ -188,7 +194,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
while (compressed_size > 0) {
cpage = compressed_pages[i];
cur_size = min_t(unsigned long, compressed_size,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
kaddr = kmap_atomic(cpage);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
@@ -202,13 +208,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
compress_type);
} else {
page = find_get_page(inode->i_mapping,
- start >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = start & (PAGE_CACHE_SIZE - 1);
+ offset = start & (PAGE_SIZE - 1);
write_extent_buffer(leaf, kaddr + offset, ptr, size);
kunmap_atomic(kaddr);
- page_cache_release(page);
+ put_page(page);
}
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -257,7 +263,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
data_len = compressed_size;
if (start > 0 ||
- actual_end > PAGE_CACHE_SIZE ||
+ actual_end > root->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
(!compressed_size &&
(actual_end & (root->sectorsize - 1)) == 0) ||
@@ -316,7 +322,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
+ btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -414,23 +420,23 @@ static noinline void compress_file_range(struct inode *inode,
unsigned long nr_pages_ret = 0;
unsigned long total_compressed = 0;
unsigned long total_in = 0;
- unsigned long max_compressed = 128 * 1024;
- unsigned long max_uncompressed = 128 * 1024;
+ unsigned long max_compressed = SZ_128K;
+ unsigned long max_uncompressed = SZ_128K;
int i;
int will_compress;
int compress_type = root->fs_info->compress_type;
int redirty = 0;
/* if this is a small write inside eof, kick off a defrag */
- if ((end - start + 1) < 16 * 1024 &&
+ if ((end - start + 1) < SZ_16K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
- nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
- nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+ nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+ nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
/*
* we don't want to send crud past the end of i_size through
@@ -508,7 +514,7 @@ again:
if (!ret) {
unsigned long offset = total_compressed &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
struct page *page = pages[nr_pages_ret - 1];
char *kaddr;
@@ -518,7 +524,7 @@ again:
if (offset) {
kaddr = kmap_atomic(page);
memset(kaddr + offset, 0,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
kunmap_atomic(kaddr);
}
will_compress = 1;
@@ -574,7 +580,7 @@ cont:
* one last check to make sure the compression is really a
* win, compare the page count read with the blocks on disk
*/
- total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
+ total_in = ALIGN(total_in, PAGE_SIZE);
if (total_compressed >= total_in) {
will_compress = 0;
} else {
@@ -588,7 +594,7 @@ cont:
*/
for (i = 0; i < nr_pages_ret; i++) {
WARN_ON(pages[i]->mapping);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
kfree(pages);
pages = NULL;
@@ -644,7 +650,7 @@ cleanup_and_bail_uncompressed:
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
WARN_ON(pages[i]->mapping);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
kfree(pages);
}
@@ -658,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
- page_cache_release(async_extent->pages[i]);
+ put_page(async_extent->pages[i]);
}
kfree(async_extent->pages);
async_extent->nr_pages = 0;
@@ -944,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode,
disk_num_bytes = num_bytes;
/* if this is a small write inside eof, kick off defrag */
- if (num_bytes < 64 * 1024 &&
+ if (num_bytes < SZ_64K &&
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
@@ -960,7 +966,7 @@ static noinline int cow_file_range(struct inode *inode,
PAGE_END_WRITEBACK);
*nr_written = *nr_written +
- (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+ (end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
goto out;
} else if (ret < 0) {
@@ -1100,14 +1106,14 @@ static noinline void async_cow_submit(struct btrfs_work *work)
async_cow = container_of(work, struct async_cow, work);
root = async_cow->root;
- nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
+ PAGE_SHIFT;
/*
* atomic_sub_return implies a barrier for waitqueue_active
*/
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
- 5 * 1024 * 1024 &&
+ 5 * SZ_1M &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
@@ -1132,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
struct btrfs_root *root = BTRFS_I(inode)->root;
unsigned long nr_pages;
u64 cur_end;
- int limit = 10 * 1024 * 1024;
+ int limit = 10 * SZ_1M;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1, 0, NULL, GFP_NOFS);
@@ -1148,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
!btrfs_test_opt(root, FORCE_COMPRESS))
cur_end = end;
else
- cur_end = min(end, start + 512 * 1024 - 1);
+ cur_end = min(end, start + SZ_512K - 1);
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
@@ -1158,8 +1164,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow_start, async_cow_submit,
async_cow_free);
- nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
+ nr_pages = (cur_end - start + PAGE_SIZE) >>
+ PAGE_SHIFT;
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
btrfs_queue_work(root->fs_info->delalloc_workers,
@@ -1304,8 +1310,14 @@ next_slot:
num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid > ino ||
- found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ if (found_key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(found_key.objectid < ino) ||
+ found_key.type < BTRFS_EXTENT_DATA_KEY) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end)
break;
@@ -1948,7 +1960,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state)
{
- WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+ WARN_ON((end & (PAGE_SIZE - 1)) == 0);
return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
cached_state, GFP_NOFS);
}
@@ -1981,16 +1993,17 @@ again:
inode = page->mapping->host;
page_start = page_offset(page);
- page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+ page_end = page_offset(page) + PAGE_SIZE - 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
goto out;
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ ordered = btrfs_lookup_ordered_range(inode, page_start,
+ PAGE_SIZE);
if (ordered) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
page_end, &cached_state, GFP_NOFS);
@@ -2001,7 +2014,7 @@ again:
}
ret = btrfs_delalloc_reserve_space(inode, page_start,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -2017,7 +2030,7 @@ out:
&cached_state, GFP_NOFS);
out_page:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
kfree(fixup);
}
@@ -2050,7 +2063,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
return -EAGAIN;
SetPageChecked(page);
- page_cache_get(page);
+ get_page(page);
btrfs_init_work(&fixup->work, btrfs_fixup_helper,
btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
@@ -2476,7 +2489,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
lock_start = backref->file_pos;
lock_end = backref->file_pos + backref->num_bytes - 1;
lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
- 0, &cached);
+ &cached);
ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
if (ordered) {
@@ -2868,7 +2881,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
lock_extent_bits(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
- 0, &cached_state);
+ &cached_state);
ret = test_range_bit(io_tree, ordered_extent->file_offset,
ordered_extent->file_offset + ordered_extent->len - 1,
@@ -3100,56 +3113,46 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
start, (size_t)(end - start + 1));
}
-struct delayed_iput {
- struct list_head list;
- struct inode *inode;
-};
-
-/* JDM: If this is fs-wide, why can't we add a pointer to
- * btrfs_inode instead and avoid the allocation? */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
- struct delayed_iput *delayed;
+ struct btrfs_inode *binode = BTRFS_I(inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
- delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
- delayed->inode = inode;
-
spin_lock(&fs_info->delayed_iput_lock);
- list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+ if (binode->delayed_iput_count == 0) {
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+ } else {
+ binode->delayed_iput_count++;
+ }
spin_unlock(&fs_info->delayed_iput_lock);
}
void btrfs_run_delayed_iputs(struct btrfs_root *root)
{
- LIST_HEAD(list);
struct btrfs_fs_info *fs_info = root->fs_info;
- struct delayed_iput *delayed;
- int empty;
-
- spin_lock(&fs_info->delayed_iput_lock);
- empty = list_empty(&fs_info->delayed_iputs);
- spin_unlock(&fs_info->delayed_iput_lock);
- if (empty)
- return;
-
- down_read(&fs_info->delayed_iput_sem);
spin_lock(&fs_info->delayed_iput_lock);
- list_splice_init(&fs_info->delayed_iputs, &list);
- spin_unlock(&fs_info->delayed_iput_lock);
-
- while (!list_empty(&list)) {
- delayed = list_entry(list.next, struct delayed_iput, list);
- list_del(&delayed->list);
- iput(delayed->inode);
- kfree(delayed);
+ while (!list_empty(&fs_info->delayed_iputs)) {
+ struct btrfs_inode *inode;
+
+ inode = list_first_entry(&fs_info->delayed_iputs,
+ struct btrfs_inode, delayed_iput);
+ if (inode->delayed_iput_count) {
+ inode->delayed_iput_count--;
+ list_move_tail(&inode->delayed_iput,
+ &fs_info->delayed_iputs);
+ } else {
+ list_del_init(&inode->delayed_iput);
+ }
+ spin_unlock(&fs_info->delayed_iput_lock);
+ iput(&inode->vfs_inode);
+ spin_lock(&fs_info->delayed_iput_lock);
}
-
- up_read(&root->fs_info->delayed_iput_sem);
+ spin_unlock(&fs_info->delayed_iput_lock);
}
/*
@@ -3345,7 +3348,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
ret = -ENOMEM;
goto out;
}
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -3544,10 +3547,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
int scanned = 0;
if (!xattr_access) {
- xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
- xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT));
+ xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+ xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
}
slot++;
@@ -3768,6 +3771,7 @@ cache_acl:
break;
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
break;
default:
@@ -4010,7 +4014,8 @@ err:
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
inode_inc_iversion(inode);
inode_inc_iversion(dir);
- inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ inode->i_ctime = dir->i_mtime =
+ dir->i_ctime = current_fs_time(inode->i_sb);
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
@@ -4040,9 +4045,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
*/
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
- int ret;
/*
* 1 for the possible orphan item
@@ -4051,27 +4054,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the inode ref
* 1 for the inode
*/
- trans = btrfs_start_transaction(root, 5);
- if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
- return trans;
-
- if (PTR_ERR(trans) == -ENOSPC) {
- u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return trans;
- ret = btrfs_cond_migrate_bytes(root->fs_info,
- &root->fs_info->trans_block_rsv,
- num_bytes, 5);
- if (ret) {
- btrfs_end_transaction(trans, root);
- return ERR_PTR(ret);
- }
- trans->block_rsv = &root->fs_info->trans_block_rsv;
- trans->bytes_reserved = num_bytes;
- }
- return trans;
+ return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4175,7 +4158,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, root, ret);
@@ -4230,11 +4213,20 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
{
int ret;
+ /*
+ * This is only used to apply pressure to the enospc system, we don't
+ * intend to use this reservation at all.
+ */
bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
+ bytes_deleted *= root->nodesize;
ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trans->transid,
+ bytes_deleted, 1);
trans->bytes_reserved += bytes_deleted;
+ }
return ret;
}
@@ -4255,7 +4247,7 @@ static int truncate_inline_extent(struct inode *inode,
if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
loff_t offset = new_size;
- loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+ loff_t page_end = ALIGN(offset, PAGE_SIZE);
/*
* Zero out the remaining of the last page of our inline extent,
@@ -4267,7 +4259,8 @@ static int truncate_inline_extent(struct inode *inode,
* read the extent item from disk (data not in the page cache).
*/
btrfs_release_path(path);
- return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+ return btrfs_truncate_block(inode, offset, page_end - offset,
+ 0);
}
btrfs_set_file_extent_ram_bytes(leaf, fi, size);
@@ -4333,7 +4326,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
/*
* We want to drop from the next block forward in case this new size is
@@ -4364,7 +4357,7 @@ search_again:
* up a huge file in a single leaf. Most of the time that
* bytes_deleted is > 0, it will be huge by the time we get here
*/
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
if (btrfs_should_end_transaction(trans, root)) {
err = -EAGAIN;
goto error;
@@ -4607,7 +4600,7 @@ error:
btrfs_free_path(path);
- if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+ if (be_nice && bytes_deleted > SZ_32M) {
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
@@ -4620,17 +4613,17 @@ error:
}
/*
- * btrfs_truncate_page - read, zero a chunk and write a page
+ * btrfs_truncate_block - read, zero a chunk and write a block
* @inode - inode that we're zeroing
* @from - the offset to start zeroing
* @len - the length to zero, 0 to zero the entire range respective to the
* offset
* @front - zero up to the offset instead of from the offset on
*
- * This will find the page for the "from" offset and cow the page and zero the
+ * This will find the block for the "from" offset and cow the block and zero the
* part we want to zero. This is used with truncate and hole punching.
*/
-int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
int front)
{
struct address_space *mapping = inode->i_mapping;
@@ -4640,19 +4633,20 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
struct extent_state *cached_state = NULL;
char *kaddr;
u32 blocksize = root->sectorsize;
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
int ret = 0;
- u64 page_start;
- u64 page_end;
+ u64 block_start;
+ u64 block_end;
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
+
ret = btrfs_delalloc_reserve_space(inode,
- round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
+ round_down(from, blocksize), blocksize);
if (ret)
goto out;
@@ -4660,21 +4654,21 @@ again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode,
- round_down(from, PAGE_CACHE_SIZE),
- PAGE_CACHE_SIZE);
+ round_down(from, blocksize),
+ blocksize);
ret = -ENOMEM;
goto out;
}
- page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ block_start = round_down(from, blocksize);
+ block_end = block_start + blocksize - 1;
if (!PageUptodate(page)) {
ret = btrfs_readpage(NULL, page);
lock_page(page);
if (page->mapping != mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
if (!PageUptodate(page)) {
@@ -4684,55 +4678,57 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, block_start, block_end, &cached_state);
set_page_extent_mapped(page);
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent_cached(io_tree, page_start, page_end,
+ unlock_extent_cached(io_tree, block_start, block_end,
&cached_state, GFP_NOFS);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
&cached_state);
if (ret) {
- unlock_extent_cached(io_tree, page_start, page_end,
+ unlock_extent_cached(io_tree, block_start, block_end,
&cached_state, GFP_NOFS);
goto out_unlock;
}
- if (offset != PAGE_CACHE_SIZE) {
+ if (offset != blocksize) {
if (!len)
- len = PAGE_CACHE_SIZE - offset;
+ len = blocksize - offset;
kaddr = kmap(page);
if (front)
- memset(kaddr, 0, offset);
+ memset(kaddr + (block_start - page_offset(page)),
+ 0, offset);
else
- memset(kaddr + offset, 0, len);
+ memset(kaddr + (block_start - page_offset(page)) + offset,
+ 0, len);
flush_dcache_page(page);
kunmap(page);
}
ClearPageChecked(page);
set_page_dirty(page);
- unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+ unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
GFP_NOFS);
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, page_start,
- PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, block_start,
+ blocksize);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
return ret;
}
@@ -4801,11 +4797,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
int err = 0;
/*
- * If our size started in the middle of a page we need to zero out the
- * rest of the page before we expand the i_size, otherwise we could
+ * If our size started in the middle of a block we need to zero out the
+ * rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- err = btrfs_truncate_page(inode, oldsize, 0, 0);
+ err = btrfs_truncate_block(inode, oldsize, 0, 0);
if (err)
return err;
@@ -4815,7 +4811,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+ lock_extent_bits(io_tree, hole_start, block_end - 1,
&cached_state);
ordered = btrfs_lookup_ordered_range(inode, hole_start,
block_end - hole_start);
@@ -4891,26 +4887,6 @@ next:
return err;
}
-static int wait_snapshoting_atomic_t(atomic_t *a)
-{
- schedule();
- return 0;
-}
-
-static void wait_for_snapshot_creation(struct btrfs_root *root)
-{
- while (true) {
- int ret;
-
- ret = btrfs_start_write_no_snapshoting(root);
- if (ret)
- break;
- wait_on_atomic_t(&root->will_be_snapshoted,
- wait_snapshoting_atomic_t,
- TASK_UNINTERRUPTIBLE);
- }
-}
-
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4934,7 +4910,6 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
}
if (newsize > oldsize) {
- truncate_pagecache(inode, newsize);
/*
* Don't do an expanding truncate while snapshoting is ongoing.
* This is to ensure the snapshot captures a fully consistent
@@ -4942,7 +4917,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* truncation, it must capture all writes that happened before
* this truncation.
*/
- wait_for_snapshot_creation(root);
+ btrfs_wait_for_snapshot_creation(root);
ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret) {
btrfs_end_write_no_snapshoting(root);
@@ -4957,6 +4932,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
i_size_write(inode, newsize);
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ pagecache_isize_extended(inode, oldsize, newsize);
ret = btrfs_update_inode(trans, root, inode);
btrfs_end_write_no_snapshoting(root);
btrfs_end_transaction(trans, root);
@@ -5127,7 +5103,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
end = state->end;
spin_unlock(&io_tree->lock);
- lock_extent_bits(io_tree, start, end, 0, &cached_state);
+ lock_extent_bits(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5320,7 +5296,6 @@ void btrfs_evict_inode(struct inode *inode)
no_delete:
btrfs_remove_delayed_node(inode);
clear_inode(inode);
- return;
}
/*
@@ -5628,7 +5603,7 @@ static struct inode *new_simple_dir(struct super_block *s,
inode->i_op = &btrfs_dir_ro_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = CURRENT_TIME;
+ inode->i_mtime = current_fs_time(inode->i_sb);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -5757,6 +5732,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
char *name_ptr;
int name_len;
int is_curr = 0; /* ctx->pos points to the current index? */
+ bool emitted;
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
@@ -5769,7 +5745,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
if (key_type == BTRFS_DIR_INDEX_KEY) {
INIT_LIST_HEAD(&ins_list);
@@ -5785,6 +5761,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (ret < 0)
goto err;
+ emitted = false;
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
@@ -5828,7 +5805,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
if (name_len <= sizeof(tmp_name)) {
name_ptr = tmp_name;
} else {
- name_ptr = kmalloc(name_len, GFP_NOFS);
+ name_ptr = kmalloc(name_len, GFP_KERNEL);
if (!name_ptr) {
ret = -ENOMEM;
goto err;
@@ -5864,6 +5841,7 @@ skip:
if (over)
goto nopos;
+ emitted = true;
di_len = btrfs_dir_name_len(leaf, di) +
btrfs_dir_data_len(leaf, di) + sizeof(*di);
di_cur += di_len;
@@ -5876,11 +5854,20 @@ next:
if (key_type == BTRFS_DIR_INDEX_KEY) {
if (is_curr)
ctx->pos++;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+ ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
if (ret)
goto nopos;
}
+ /*
+ * If we haven't emitted any dir entry, we must not touch ctx->pos as
+ * it was set to the termination value in previous call. We assume
+ * that "." and ".." were emitted if we reach this point and set the
+ * termination value as well for an empty directory.
+ */
+ if (ctx->pos > 2 && !emitted)
+ goto nopos;
+
/* Reached end of directory/root. Bump pos past the last item. */
ctx->pos++;
@@ -6200,7 +6187,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
- inode->i_mtime = CURRENT_TIME;
+ inode->i_mtime = current_fs_time(inode->i_sb);
inode->i_atime = inode->i_mtime;
inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
@@ -6313,7 +6300,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->i_size +
name_len * 2);
inode_inc_iversion(parent_inode);
- parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ parent_inode->i_mtime = parent_inode->i_ctime =
+ current_fs_time(parent_inode->i_sb);
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
btrfs_abort_transaction(trans, root, ret);
@@ -6360,9 +6348,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/*
* 2 for inode item and ref
* 2 for dir items
@@ -6500,7 +6485,7 @@ out_unlock_inode:
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = d_inode(old_dentry);
u64 index;
@@ -6526,6 +6511,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
goto fail;
}
@@ -6533,7 +6519,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
BTRFS_I(inode)->dir_index = 0ULL;
inc_nlink(inode);
inode_inc_iversion(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
@@ -6559,9 +6545,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
btrfs_log_new_name(trans, inode, NULL, parent);
}
- btrfs_end_transaction(trans, root);
btrfs_balance_delayed_items(root);
fail:
+ if (trans)
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -6706,7 +6693,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
}
static noinline int uncompress_inline(struct btrfs_path *path,
- struct inode *inode, struct page *page,
+ struct page *page,
size_t pg_offset, u64 extent_offset,
struct btrfs_file_extent_item *item)
{
@@ -6730,7 +6717,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
- max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+ max_size = min_t(unsigned long, PAGE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
kfree(tmp);
@@ -6803,7 +6790,7 @@ again:
* Chances are we'll be called again, so go ahead and do
* readahead
*/
- path->reada = 1;
+ path->reada = READA_FORWARD;
}
ret = btrfs_lookup_file_extent(trans, root, path,
@@ -6892,8 +6879,8 @@ next:
size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_offset = page_offset(page) + pg_offset - extent_start;
- copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
- size - extent_offset);
+ copy_size = min_t(u64, PAGE_SIZE - pg_offset,
+ size - extent_offset);
em->start = extent_start + extent_offset;
em->len = ALIGN(copy_size, root->sectorsize);
em->orig_block_len = em->len;
@@ -6902,8 +6889,7 @@ next:
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
- ret = uncompress_inline(path, inode, page,
- pg_offset,
+ ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
if (ret) {
err = ret;
@@ -6913,9 +6899,9 @@ next:
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
- if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+ if (pg_offset + copy_size < PAGE_SIZE) {
memset(map + pg_offset + copy_size, 0,
- PAGE_CACHE_SIZE - pg_offset -
+ PAGE_SIZE - pg_offset -
copy_size);
}
kunmap(page);
@@ -7158,21 +7144,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
if (ret)
return ERR_PTR(ret);
- em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
- ins.offset, ins.offset, ins.offset, 0);
- if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- return em;
- }
-
+ /*
+ * Create the ordered extent before the extent map. This is to avoid
+ * races with the fast fsync path that would lead to it logging file
+ * extent items that point to disk extents that were not yet written to.
+ * The fast fsync path collects ordered extents into a local list and
+ * then collects all the new extent maps, so we must create the ordered
+ * extent first and make sure the fast fsync path collects any new
+ * ordered extents after collecting new extent maps as well.
+ * The fsync path simply can not rely on inode_dio_wait() because it
+ * causes deadlock with AIO.
+ */
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
- free_extent_map(em);
return ERR_PTR(ret);
}
+ em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+ ins.offset, ins.offset, ins.offset, 0);
+ if (IS_ERR(em)) {
+ struct btrfs_ordered_extent *oe;
+
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+ oe = btrfs_lookup_ordered_extent(inode, start);
+ ASSERT(oe);
+ if (WARN_ON(!oe))
+ return em;
+ set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+ set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+ btrfs_remove_ordered_extent(inode, oe);
+ /* Once for our lookup and once for the ordered extents tree. */
+ btrfs_put_ordered_extent(oe);
+ btrfs_put_ordered_extent(oe);
+ }
return em;
}
@@ -7330,12 +7336,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
int start_idx;
int end_idx;
- start_idx = start >> PAGE_CACHE_SHIFT;
+ start_idx = start >> PAGE_SHIFT;
/*
* end is the last byte in the last page. end == start is legal
*/
- end_idx = end >> PAGE_CACHE_SHIFT;
+ end_idx = end >> PAGE_SHIFT;
rcu_read_lock();
@@ -7376,7 +7382,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
* include/linux/pagemap.h for details.
*/
if (unlikely(page != *pagep)) {
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
}
@@ -7384,7 +7390,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
if (page) {
if (page->index <= end_idx)
found = true;
- page_cache_release(page);
+ put_page(page);
}
rcu_read_unlock();
@@ -7399,7 +7405,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- 0, cached_state);
+ cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
@@ -7424,30 +7430,47 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
cached_state, GFP_NOFS);
if (ordered) {
- btrfs_start_ordered_extent(inode, ordered, 1);
+ /*
+ * If we are doing a DIO read and the ordered extent we
+ * found is for a buffered write, we can not wait for it
+ * to complete and retry, because if we do so we can
+ * deadlock with concurrent buffered writes on page
+ * locks. This happens only if our DIO read covers more
+ * than one extent map, if at this point it has already
+ * created an ordered extent for a previous extent map
+ * and locked its range in the inode's io tree, and a
+ * concurrent write against that previous extent map's
+ * range and this range started (we unlock the ranges
+ * in the io tree only when the bios complete and
+ * buffered writes always lock pages before attempting
+ * to lock range in the io tree).
+ */
+ if (writing ||
+ test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ else
+ ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
} else {
- /* Screw you mmap */
- ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
- if (ret)
- break;
- ret = filemap_fdatawait_range(inode->i_mapping,
- lockstart,
- lockend);
- if (ret)
- break;
-
/*
- * If we found a page that couldn't be invalidated just
- * fall back to buffered.
+ * We could trigger writeback for this range (and wait
+ * for it to complete) and then invalidate the pages for
+ * this range (through invalidate_inode_pages2_range()),
+ * but that can lead us to a deadlock with a concurrent
+ * call to readpages() (a buffered read or a defrag call
+ * triggered a readahead) on a page lock due to an
+ * ordered dio extent we created before but did not have
+ * yet a corresponding bio submitted (whence it can not
+ * complete), which makes readpages() wait for that
+ * ordered extent to complete while holding a lock on
+ * that page.
*/
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- lockstart >> PAGE_CACHE_SHIFT,
- lockend >> PAGE_CACHE_SHIFT);
- if (ret)
- break;
+ ret = -ENOTBLK;
}
+ if (ret)
+ break;
+
cond_resched();
}
@@ -7501,10 +7524,27 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
return em;
}
-struct btrfs_dio_data {
- u64 outstanding_extents;
- u64 reserve;
-};
+static void adjust_dio_outstanding_extents(struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 len)
+{
+ unsigned num_extents;
+
+ num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (dio_data->outstanding_extents) {
+ dio_data->outstanding_extents -= num_extents;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents += num_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -7541,8 +7581,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
- return -ENOTBLK;
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+ create)) {
+ ret = -ENOTBLK;
+ goto err;
+ }
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
@@ -7660,22 +7703,11 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- /*
- * If we have an outstanding_extents count still set then we're
- * within our reservation, otherwise we need to adjust our inode
- * counter appropriately.
- */
- if (dio_data->outstanding_extents) {
- (dio_data->outstanding_extents)--;
- } else {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
-
+ adjust_dio_outstanding_extents(inode, dio_data, len);
btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
+ dio_data->unsubmitted_oe_range_end = start + len;
current->journal_info = dio_data;
}
@@ -7699,8 +7731,17 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+err:
if (dio_data)
current->journal_info = dio_data;
+ /*
+ * Compensate the delalloc release we do in btrfs_direct_IO() when we
+ * write less data than expected, so that we don't underflow our inode's
+ * outstanding extents counter.
+ */
+ if (create && dio_data)
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+
return ret;
}
@@ -7760,9 +7801,9 @@ static int btrfs_check_dio_repairable(struct inode *inode,
}
static int dio_read_error(struct inode *inode, struct bio *failed_bio,
- struct page *page, u64 start, u64 end,
- int failed_mirror, bio_end_io_t *repair_endio,
- void *repair_arg)
+ struct page *page, unsigned int pgoff,
+ u64 start, u64 end, int failed_mirror,
+ bio_end_io_t *repair_endio, void *repair_arg)
{
struct io_failure_record *failrec;
struct bio *bio;
@@ -7783,7 +7824,9 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
return -EIO;
}
- if (failed_bio->bi_vcnt > 1)
+ if ((failed_bio->bi_vcnt > 1)
+ || (failed_bio->bi_io_vec->bv_len
+ > BTRFS_I(inode)->root->sectorsize))
read_mode = READ_SYNC | REQ_FAILFAST_DEV;
else
read_mode = READ_SYNC;
@@ -7791,7 +7834,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio,
isector = start - btrfs_io_bio(failed_bio)->logical;
isector >>= inode->i_sb->s_blocksize_bits;
bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
- 0, isector, repair_endio, repair_arg);
+ pgoff, isector, repair_endio, repair_arg);
if (!bio) {
free_io_failure(inode, failrec);
return -EIO;
@@ -7821,12 +7864,17 @@ struct btrfs_retry_complete {
static void btrfs_retry_endio_nocsum(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
+ struct inode *inode;
struct bio_vec *bvec;
int i;
if (bio->bi_error)
goto end;
+ ASSERT(bio->bi_vcnt == 1);
+ inode = bio->bi_io_vec->bv_page->mapping->host;
+ ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
+
done->uptodate = 1;
bio_for_each_segment_all(bvec, bio, i)
clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
@@ -7838,25 +7886,35 @@ end:
static int __btrfs_correct_data_nocsum(struct inode *inode,
struct btrfs_io_bio *io_bio)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct btrfs_retry_complete done;
u64 start;
+ unsigned int pgoff;
+ u32 sectorsize;
+ int nr_sectors;
int i;
int ret;
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ sectorsize = BTRFS_I(inode)->root->sectorsize;
+
start = io_bio->logical;
done.inode = inode;
bio_for_each_segment_all(bvec, &io_bio->bio, i) {
-try_again:
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ pgoff = bvec->bv_offset;
+
+next_block_or_try_again:
done.uptodate = 0;
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
- start + bvec->bv_len - 1,
- io_bio->mirror_num,
- btrfs_retry_endio_nocsum, &done);
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ pgoff, start, start + sectorsize - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio_nocsum, &done);
if (ret)
return ret;
@@ -7864,10 +7922,15 @@ try_again:
if (!done.uptodate) {
/* We might have another mirror, so try again */
- goto try_again;
+ goto next_block_or_try_again;
}
- start += bvec->bv_len;
+ start += sectorsize;
+
+ if (nr_sectors--) {
+ pgoff += sectorsize;
+ goto next_block_or_try_again;
+ }
}
return 0;
@@ -7877,7 +7940,9 @@ static void btrfs_retry_endio(struct bio *bio)
{
struct btrfs_retry_complete *done = bio->bi_private;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+ struct inode *inode;
struct bio_vec *bvec;
+ u64 start;
int uptodate;
int ret;
int i;
@@ -7886,13 +7951,20 @@ static void btrfs_retry_endio(struct bio *bio)
goto end;
uptodate = 1;
+
+ start = done->start;
+
+ ASSERT(bio->bi_vcnt == 1);
+ inode = bio->bi_io_vec->bv_page->mapping->host;
+ ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize);
+
bio_for_each_segment_all(bvec, bio, i) {
ret = __readpage_endio_check(done->inode, io_bio, i,
- bvec->bv_page, 0,
- done->start, bvec->bv_len);
+ bvec->bv_page, bvec->bv_offset,
+ done->start, bvec->bv_len);
if (!ret)
clean_io_failure(done->inode, done->start,
- bvec->bv_page, 0);
+ bvec->bv_page, bvec->bv_offset);
else
uptodate = 0;
}
@@ -7906,20 +7978,34 @@ end:
static int __btrfs_subio_endio_read(struct inode *inode,
struct btrfs_io_bio *io_bio, int err)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct btrfs_retry_complete done;
u64 start;
u64 offset = 0;
+ u32 sectorsize;
+ int nr_sectors;
+ unsigned int pgoff;
+ int csum_pos;
int i;
int ret;
+ fs_info = BTRFS_I(inode)->root->fs_info;
+ sectorsize = BTRFS_I(inode)->root->sectorsize;
+
err = 0;
start = io_bio->logical;
done.inode = inode;
bio_for_each_segment_all(bvec, &io_bio->bio, i) {
- ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
- 0, start, bvec->bv_len);
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+
+ pgoff = bvec->bv_offset;
+next_block:
+ csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
+ ret = __readpage_endio_check(inode, io_bio, csum_pos,
+ bvec->bv_page, pgoff, start,
+ sectorsize);
if (likely(!ret))
goto next;
try_again:
@@ -7927,10 +8013,10 @@ try_again:
done.start = start;
init_completion(&done.done);
- ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
- start + bvec->bv_len - 1,
- io_bio->mirror_num,
- btrfs_retry_endio, &done);
+ ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
+ pgoff, start, start + sectorsize - 1,
+ io_bio->mirror_num,
+ btrfs_retry_endio, &done);
if (ret) {
err = ret;
goto next;
@@ -7943,8 +8029,15 @@ try_again:
goto try_again;
}
next:
- offset += bvec->bv_len;
- start += bvec->bv_len;
+ offset += sectorsize;
+ start += sectorsize;
+
+ ASSERT(nr_sectors);
+
+ if (--nr_sectors) {
+ pgoff += sectorsize;
+ goto next_block;
+ }
}
return err;
@@ -7982,6 +8075,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
if (io_bio->end_io)
@@ -7989,22 +8083,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
bio_put(bio);
}
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+ const u64 offset,
+ const u64 bytes,
+ const int uptodate)
{
- struct btrfs_dio_private *dip = bio->bi_private;
- struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered = NULL;
- u64 ordered_offset = dip->logical_offset;
- u64 ordered_bytes = dip->bytes;
- struct bio *dio_bio;
+ u64 ordered_offset = offset;
+ u64 ordered_bytes = bytes;
int ret;
again:
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
&ordered_offset,
ordered_bytes,
- !bio->bi_error);
+ uptodate);
if (!ret)
goto out_test;
@@ -8017,16 +8111,26 @@ out_test:
* our bio might span multiple ordered extents. If we haven't
* completed the accounting for the whole dio, go back and try again
*/
- if (ordered_offset < dip->logical_offset + dip->bytes) {
- ordered_bytes = dip->logical_offset + dip->bytes -
- ordered_offset;
+ if (ordered_offset < offset + bytes) {
+ ordered_bytes = offset + bytes - ordered_offset;
ordered = NULL;
goto again;
}
- dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct bio *dio_bio = dip->dio_bio;
+
+ btrfs_endio_direct_write_update_ordered(dip->inode,
+ dip->logical_offset,
+ dip->bytes,
+ !bio->bi_error);
kfree(dip);
+ dio_bio->bi_error = bio->bi_error;
dio_end_io(dio_bio, bio->bi_error);
bio_put(bio);
}
@@ -8187,9 +8291,11 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
u64 file_offset = dip->logical_offset;
u64 submit_len = 0;
u64 map_length;
- int nr_pages = 0;
- int ret;
+ u32 blocksize = root->sectorsize;
int async_submit = 0;
+ int nr_sectors;
+ int ret;
+ int i;
map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
@@ -8219,9 +8325,12 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
atomic_inc(&dip->pending_bios);
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
- if (map_length < submit_len + bvec->bv_len ||
- bio_add_page(bio, bvec->bv_page, bvec->bv_len,
- bvec->bv_offset) < bvec->bv_len) {
+ nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len);
+ i = 0;
+next_block:
+ if (unlikely(map_length < submit_len + blocksize ||
+ bio_add_page(bio, bvec->bv_page, blocksize,
+ bvec->bv_offset + (i * blocksize)) < blocksize)) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -8242,7 +8351,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
file_offset += submit_len;
submit_len = 0;
- nr_pages = 0;
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
start_sector, GFP_NOFS);
@@ -8260,9 +8368,14 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio_put(bio);
goto out_err;
}
+
+ goto next_block;
} else {
- submit_len += bvec->bv_len;
- nr_pages++;
+ submit_len += blocksize;
+ if (--nr_sectors) {
+ i++;
+ goto next_block;
+ }
bvec++;
}
}
@@ -8331,6 +8444,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->subio_endio = btrfs_subio_endio_read;
}
+ /*
+ * Reset the range for unsubmitted ordered extents (to a 0 length range)
+ * even if we fail to submit a bio, because in such case we do the
+ * corresponding error handling below and it must not be done a second
+ * time by btrfs_direct_IO().
+ */
+ if (write) {
+ struct btrfs_dio_data *dio_data = current->journal_info;
+
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+ dip->bytes;
+ dio_data->unsubmitted_oe_range_start =
+ dio_data->unsubmitted_oe_range_end;
+ }
+
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
if (!ret)
return;
@@ -8359,24 +8487,15 @@ free_ordered:
dip = NULL;
io_bio = NULL;
} else {
- if (write) {
- struct btrfs_ordered_extent *ordered;
-
- ordered = btrfs_lookup_ordered_extent(inode,
- file_offset);
- set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
- /*
- * Decrements our ref on the ordered extent and removes
- * the ordered extent from the inode's ordered tree,
- * doing all the proper resource cleanup such as for the
- * reserved space and waking up any waiters for this
- * ordered extent (through btrfs_remove_ordered_extent).
- */
- btrfs_finish_ordered_io(ordered);
- } else {
+ if (write)
+ btrfs_endio_direct_write_update_ordered(inode,
+ file_offset,
+ dio_bio->bi_iter.bi_size,
+ 0);
+ else
unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
file_offset + dio_bio->bi_iter.bi_size - 1);
- }
+
dio_bio->bi_error = -EIO;
/*
* Releases and cleans up our dio_bio, no need to bio_put()
@@ -8460,7 +8579,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
relock = true;
}
ret = btrfs_delalloc_reserve_space(inode, offset, count);
@@ -8476,6 +8595,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* originally calculated. Abuse current->journal_info for this.
*/
dio_data.reserve = round_up(count, root->sectorsize);
+ dio_data.unsubmitted_oe_range_start = (u64)offset;
+ dio_data.unsubmitted_oe_range_end = (u64)offset;
current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
@@ -8494,6 +8615,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (dio_data.reserve)
btrfs_delalloc_release_space(inode, offset,
dio_data.reserve);
+ /*
+ * On error we might have left some ordered extents
+ * without submitting corresponding bios for them, so
+ * clean them up to avoid other tasks getting them
+ * and waiting for them to complete forever.
+ */
+ if (dio_data.unsubmitted_oe_range_start <
+ dio_data.unsubmitted_oe_range_end)
+ btrfs_endio_direct_write_update_ordered(inode,
+ dio_data.unsubmitted_oe_range_start,
+ dio_data.unsubmitted_oe_range_end -
+ dio_data.unsubmitted_oe_range_start,
+ 0);
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
@@ -8502,7 +8636,7 @@ out:
if (wakeup)
inode_dio_end(inode);
if (relock)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
return ret;
}
@@ -8531,15 +8665,28 @@ int btrfs_readpage(struct file *file, struct page *page)
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
-
+ struct inode *inode = page->mapping->host;
+ int ret;
if (current->flags & PF_MEMALLOC) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
+
+ /*
+ * If we are under memory pressure this is called directly from the VM,
+ * so we need to make sure we have the inode referenced for the ordered
+ * extent. If not, just return as if we didn't do anything.
+ */
+ if (!igrab(inode)) {
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+ btrfs_add_delayed_iput(inode);
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -8572,7 +8719,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
if (ret == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
return ret;
}
@@ -8592,7 +8739,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+ u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 start;
+ u64 end;
int inode_evicting = inode->i_state & I_FREEING;
/*
@@ -8611,15 +8760,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
}
if (!inode_evicting)
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ lock_extent_bits(tree, page_start, page_end, &cached_state);
+again:
+ start = page_start;
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ page_end - start + 1);
if (ordered) {
+ end = min(page_end, ordered->file_offset + ordered->len - 1);
/*
* IO on this page will never be started, so we need
* to account for any ordered extents now
*/
if (!inode_evicting)
- clear_extent_bit(tree, page_start, page_end,
+ clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 1, 0, &cached_state,
@@ -8636,22 +8789,26 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- new_len = page_start - ordered->file_offset;
+ new_len = start - ordered->file_offset;
if (new_len < ordered->truncated_len)
ordered->truncated_len = new_len;
spin_unlock_irq(&tree->lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
- page_start,
- PAGE_CACHE_SIZE, 1))
+ start,
+ end - start + 1, 1))
btrfs_finish_ordered_io(ordered);
}
btrfs_put_ordered_extent(ordered);
if (!inode_evicting) {
cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0,
+ lock_extent_bits(tree, start, end,
&cached_state);
}
+
+ start = end + 1;
+ if (start < page_end)
+ goto again;
}
/*
@@ -8665,7 +8822,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
* 2) Not written to disk
* This means the reserved space should be freed here.
*/
- btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
+ btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8680,7 +8837,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
if (PagePrivate(page)) {
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
}
@@ -8712,15 +8869,28 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size;
int ret;
int reserved = 0;
+ u64 reserved_space;
u64 page_start;
u64 page_end;
+ u64 end;
+
+ reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ page_end = page_start + PAGE_SIZE - 1;
+ end = page_end;
+ /*
+ * Reserving delalloc space after obtaining the page lock can lead to
+ * deadlock. For example, if a dirty page is locked by this function
+ * and the call to btrfs_delalloc_reserve_space() ends up triggering
+ * dirty page write out, then the btrfs_writepage() function could
+ * end up waiting indefinitely to get a lock on the page currently
+ * being processed by btrfs_page_mkwrite() function.
+ */
ret = btrfs_delalloc_reserve_space(inode, page_start,
- PAGE_CACHE_SIZE);
+ reserved_space);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
@@ -8747,14 +8917,14 @@ again:
}
wait_on_page_writeback(page);
- lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+ lock_extent_bits(io_tree, page_start, page_end, &cached_state);
set_page_extent_mapped(page);
/*
* we can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish
*/
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, page_end);
if (ordered) {
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
@@ -8764,6 +8934,18 @@ again:
goto again;
}
+ if (page->index == ((size - 1) >> PAGE_SHIFT)) {
+ reserved_space = round_up(size - page_start, root->sectorsize);
+ if (reserved_space < PAGE_SIZE) {
+ end = page_start + reserved_space - 1;
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_delalloc_release_space(inode, page_start,
+ PAGE_SIZE - reserved_space);
+ }
+ }
+
/*
* XXX - page_mkwrite gets called every time the page is dirtied, even
* if it was already dirty, so for space accounting reasons we need to
@@ -8771,12 +8953,12 @@ again:
* is probably a better way to do this, but for now keep consistent with
* prepare_pages in the normal write path.
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, &cached_state, GFP_NOFS);
- ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+ ret = btrfs_set_extent_delalloc(inode, page_start, end,
&cached_state);
if (ret) {
unlock_extent_cached(io_tree, page_start, page_end,
@@ -8787,14 +8969,14 @@ again:
ret = 0;
/* page is wholly or partially inside EOF */
- if (page_start + PAGE_CACHE_SIZE > size)
- zero_start = size & ~PAGE_CACHE_MASK;
+ if (page_start + PAGE_SIZE > size)
+ zero_start = size & ~PAGE_MASK;
else
- zero_start = PAGE_CACHE_SIZE;
+ zero_start = PAGE_SIZE;
- if (zero_start != PAGE_CACHE_SIZE) {
+ if (zero_start != PAGE_SIZE) {
kaddr = kmap(page);
- memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+ memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
flush_dcache_page(page);
kunmap(page);
}
@@ -8815,7 +8997,7 @@ out_unlock:
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start, reserved_space);
out_noreserve:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -9021,6 +9203,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
+ ei->delayed_iput_count = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -9045,6 +9228,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
+ INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
return inode;
@@ -9139,25 +9323,19 @@ void btrfs_destroy_cachep(void)
* destroy cache.
*/
rcu_barrier();
- if (btrfs_inode_cachep)
- kmem_cache_destroy(btrfs_inode_cachep);
- if (btrfs_trans_handle_cachep)
- kmem_cache_destroy(btrfs_trans_handle_cachep);
- if (btrfs_transaction_cachep)
- kmem_cache_destroy(btrfs_transaction_cachep);
- if (btrfs_path_cachep)
- kmem_cache_destroy(btrfs_path_cachep);
- if (btrfs_free_space_cachep)
- kmem_cache_destroy(btrfs_free_space_cachep);
- if (btrfs_delalloc_work_cachep)
- kmem_cache_destroy(btrfs_delalloc_work_cachep);
+ kmem_cache_destroy(btrfs_inode_cachep);
+ kmem_cache_destroy(btrfs_trans_handle_cachep);
+ kmem_cache_destroy(btrfs_transaction_cachep);
+ kmem_cache_destroy(btrfs_path_cachep);
+ kmem_cache_destroy(btrfs_free_space_cachep);
}
int btrfs_init_cachep(void)
{
btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
sizeof(struct btrfs_inode), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ init_once);
if (!btrfs_inode_cachep)
goto fail;
@@ -9185,13 +9363,6 @@ int btrfs_init_cachep(void)
if (!btrfs_free_space_cachep)
goto fail;
- btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
- sizeof(struct btrfs_delalloc_work), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- NULL);
- if (!btrfs_delalloc_work_cachep)
- goto fail;
-
return 0;
fail:
btrfs_destroy_cachep();
@@ -9207,7 +9378,6 @@ static int btrfs_getattr(struct vfsmount *mnt,
generic_fillattr(inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
- stat->blksize = PAGE_CACHE_SIZE;
spin_lock(&BTRFS_I(inode)->lock);
delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
@@ -9225,7 +9395,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
- struct timespec ctime = CURRENT_TIME;
u64 index = 0;
u64 root_objectid;
int ret;
@@ -9322,9 +9491,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
- old_dir->i_ctime = old_dir->i_mtime = ctime;
- new_dir->i_ctime = new_dir->i_mtime = ctime;
- old_inode->i_ctime = ctime;
+ old_dir->i_ctime = old_dir->i_mtime =
+ new_dir->i_ctime = new_dir->i_mtime =
+ old_inode->i_ctime = current_fs_time(old_dir->i_sb);
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
@@ -9349,7 +9518,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode) {
inode_inc_iversion(new_inode);
- new_inode->i_ctime = CURRENT_TIME;
+ new_inode->i_ctime = current_fs_time(new_inode->i_sb);
if (unlikely(btrfs_ino(new_inode) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
root_objectid = BTRFS_I(new_inode)->location.objectid;
@@ -9415,14 +9584,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
delalloc_work = container_of(work, struct btrfs_delalloc_work,
work);
inode = delalloc_work->inode;
- if (delalloc_work->wait) {
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
if (delalloc_work->delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9432,18 +9597,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
}
struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
- int wait, int delay_iput)
+ int delay_iput)
{
struct btrfs_delalloc_work *work;
- work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+ work = kmalloc(sizeof(*work), GFP_NOFS);
if (!work)
return NULL;
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- work->wait = wait;
work->delay_iput = delay_iput;
WARN_ON_ONCE(!inode);
btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
@@ -9455,7 +9619,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
{
wait_for_completion(&work->completion);
- kmem_cache_free(btrfs_delalloc_work_cachep, work);
+ kfree(work);
}
/*
@@ -9491,7 +9655,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
}
spin_unlock(&root->delalloc_lock);
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ work = btrfs_alloc_delalloc_work(inode, delay_iput);
if (!work) {
if (delay_iput)
btrfs_add_delayed_iput(inode);
@@ -9633,9 +9797,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
/*
* 2 items for inode item and ref
* 2 items for dir items
+ * 1 item for updating parent inode item
+ * 1 item for the inline extent item
* 1 item for xattr if selinux is on
*/
- trans = btrfs_start_transaction(root, 5);
+ trans = btrfs_start_transaction(root, 7);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -9666,10 +9832,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock_inode;
- err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
- if (err)
- goto out_unlock_inode;
-
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
@@ -9702,10 +9864,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
btrfs_free_path(path);
inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
+ /*
+ * Last step, add directory indexes for our symlink inode. This is the
+ * last step to avoid extra cleanup of these indexes if an error happens
+ * elsewhere above.
+ */
+ if (!err)
+ err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
if (err) {
drop_inode = 1;
goto out_unlock_inode;
@@ -9756,7 +9926,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
}
}
- cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+ cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
* If we are severely fragmented we could end up with really
@@ -9826,7 +9996,7 @@ next:
*alloc_hint = ins.objectid + ins.offset;
inode_inc_iversion(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(actual_len > inode->i_size) &&
@@ -9991,7 +10161,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.setattr = btrfs_setattr,
.mknod = btrfs_mknod,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
@@ -10020,7 +10190,7 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static struct extent_io_ops btrfs_extent_io_ops = {
+static const struct extent_io_ops btrfs_extent_io_ops = {
.fill_delalloc = run_delalloc_range,
.submit_bio_hook = btrfs_submit_bio_hook,
.merge_bio_hook = btrfs_merge_bio_hook,
@@ -10068,7 +10238,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
@@ -10082,7 +10252,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
@@ -10091,13 +10261,12 @@ static const struct inode_operations btrfs_special_inode_operations = {
};
static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
.setxattr = btrfs_setxattr,
- .getxattr = btrfs_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.update_time = btrfs_update_time,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index da94138eb85e..5a23806ae418 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,8 @@
#include "props.h"
#include "sysfs.h"
#include "qgroup.h"
+#include "tree-log.h"
+#include "compression.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -240,7 +242,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ip_oldflags = ip->flags;
i_oldflags = inode->i_flags;
@@ -347,7 +349,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
btrfs_update_iflags(inode);
inode_inc_iversion(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
ret = btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans, root);
@@ -358,7 +360,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mnt_drop_write_file(file);
return ret;
}
@@ -443,7 +445,7 @@ static noinline int create_subvol(struct inode *dir,
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
- struct timespec cur_time = CURRENT_TIME;
+ struct timespec cur_time = current_fs_time(dir->i_sb);
struct inode *inode;
int ret;
int err;
@@ -568,6 +570,10 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
+ mutex_lock(&new_root->objectid_mutex);
+ new_root->highest_objectid = new_dirid;
+ mutex_unlock(&new_root->objectid_mutex);
+
/*
* insert the directory item
*/
@@ -655,22 +661,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+ GFP_NOFS);
+ pending_snapshot->path = btrfs_alloc_path();
+ if (!pending_snapshot->root_item || !pending_snapshot->path) {
+ ret = -ENOMEM;
+ goto free_pending;
+ }
+
atomic_inc(&root->will_be_snapshoted);
smp_mb__after_atomic();
btrfs_wait_for_no_snapshoting_writes(root);
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- goto out;
+ goto dec_and_free;
btrfs_wait_ordered_extents(root, -1);
- pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- goto out;
- }
-
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
@@ -686,7 +698,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto free;
+ goto dec_and_free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -737,11 +749,14 @@ fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-free:
- kfree(pending_snapshot);
-out:
+dec_and_free:
if (atomic_dec_and_test(&root->will_be_snapshoted))
wake_up_atomic_t(&root->will_be_snapshoted);
+free_pending:
+ kfree(pending_snapshot->root_item);
+ btrfs_free_path(pending_snapshot->path);
+ kfree(pending_snapshot);
+
return ret;
}
@@ -831,10 +846,6 @@ static noinline int btrfs_mksubvol(struct path *parent,
if (IS_ERR(dentry))
goto out_unlock;
- error = -EEXIST;
- if (d_really_is_positive(dentry))
- goto out_dput;
-
error = btrfs_may_create(dir, dentry);
if (error)
goto out_dput;
@@ -868,7 +879,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return error;
}
@@ -887,7 +898,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
u64 end;
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+ em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
read_unlock(&em_tree->lock);
if (em) {
@@ -977,7 +988,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
- u64 len = PAGE_CACHE_SIZE;
+ u64 len = PAGE_SIZE;
/*
* hopefully we have this extent in the tree already, try without
@@ -992,7 +1003,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
u64 end = start + len - 1;
/* get the big lock and read metadata off disk */
- lock_extent_bits(io_tree, start, end, 0, &cached);
+ lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
@@ -1016,7 +1027,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+ (em->block_len > SZ_128K && next->block_len > SZ_128K))
ret = false;
free_extent_map(next);
@@ -1113,15 +1124,15 @@ static int cluster_pages_for_defrag(struct inode *inode,
struct extent_io_tree *tree;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
- file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ file_end = (isize - 1) >> PAGE_SHIFT;
if (!isize || start_index > file_end)
return 0;
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode,
- start_index << PAGE_CACHE_SHIFT,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_SHIFT,
+ page_cnt << PAGE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1137,10 +1148,10 @@ again:
break;
page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ page_end = page_start + PAGE_SIZE - 1;
while (1) {
lock_extent_bits(tree, page_start, page_end,
- 0, &cached_state);
+ &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
@@ -1158,7 +1169,7 @@ again:
*/
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
}
@@ -1168,7 +1179,7 @@ again:
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ret = -EIO;
break;
}
@@ -1176,7 +1187,7 @@ again:
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
@@ -1197,10 +1208,10 @@ again:
wait_on_page_writeback(pages[i]);
page_start = page_offset(pages[0]);
- page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+ page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- page_start, page_end - 1, 0, &cached_state);
+ page_start, page_end - 1, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
@@ -1211,8 +1222,8 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
- start_index << PAGE_CACHE_SHIFT,
- (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ start_index << PAGE_SHIFT,
+ (page_cnt - i_done) << PAGE_SHIFT);
}
@@ -1229,17 +1240,17 @@ again:
set_page_extent_mapped(pages[i]);
set_page_dirty(pages[i]);
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
return i_done;
out:
for (i = 0; i < i_done; i++) {
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
btrfs_delalloc_release_space(inode,
- start_index << PAGE_CACHE_SHIFT,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_SHIFT,
+ page_cnt << PAGE_SHIFT);
return ret;
}
@@ -1262,9 +1273,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
u32 extent_thresh = range->extent_thresh;
- unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
unsigned long cluster = max_cluster;
- u64 new_align = ~((u64)128 * 1024 - 1);
+ u64 new_align = ~((u64)SZ_128K - 1);
struct page **pages = NULL;
if (isize == 0)
@@ -1281,7 +1292,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
if (extent_thresh == 0)
- extent_thresh = 256 * 1024;
+ extent_thresh = SZ_256K;
/*
* if we were not given a file, allocate a readahead
@@ -1306,25 +1317,25 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
/* find the last page to defrag */
if (range->start + range->len > range->start) {
last_index = min_t(u64, isize - 1,
- range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+ range->start + range->len - 1) >> PAGE_SHIFT;
} else {
- last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (isize - 1) >> PAGE_SHIFT;
}
if (newer_than) {
ret = find_new_extents(root, inode, newer_than,
- &newer_off, 64 * 1024);
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
/*
* we always align our defrag to help keep
* the extents in the file evenly spaced
*/
- i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ i = (newer_off & new_align) >> PAGE_SHIFT;
} else
goto out_ra;
} else {
- i = range->start >> PAGE_CACHE_SHIFT;
+ i = range->start >> PAGE_SHIFT;
}
if (!max_to_defrag)
max_to_defrag = last_index - i + 1;
@@ -1337,7 +1348,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
inode->i_mapping->writeback_index = i;
while (i <= last_index && defrag_count < max_to_defrag &&
- (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
+ (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
/*
* make sure we stop running if someone unmounts
* the FS
@@ -1351,7 +1362,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
}
- if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+ if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
extent_thresh, &last_len, &skip,
&defrag_end, range->flags &
BTRFS_DEFRAG_RANGE_COMPRESS)) {
@@ -1360,14 +1371,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
- next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
+ next = DIV_ROUND_UP(skip, PAGE_SIZE);
i = max(i + 1, next);
continue;
}
if (!newer_than) {
- cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
- PAGE_CACHE_SHIFT) - i;
+ cluster = (PAGE_ALIGN(defrag_end) >>
+ PAGE_SHIFT) - i;
cluster = min(cluster, max_cluster);
} else {
cluster = max_cluster;
@@ -1380,18 +1391,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index += cluster;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
if (ret < 0) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out_ra;
}
defrag_count += ret;
balance_dirty_pages_ratelimited(inode->i_mapping);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (newer_than) {
if (newer_off == (u64)-1)
@@ -1401,21 +1412,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i += ret;
newer_off = max(newer_off + 1,
- (u64)i << PAGE_CACHE_SHIFT);
+ (u64)i << PAGE_SHIFT);
- ret = find_new_extents(root, inode,
- newer_than, &newer_off,
- 64 * 1024);
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
- i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ i = (newer_off & new_align) >> PAGE_SHIFT;
} else {
break;
}
} else {
if (ret > 0) {
i += ret;
- last_len += ret << PAGE_CACHE_SHIFT;
+ last_len += ret << PAGE_SHIFT;
} else {
i++;
last_len = 0;
@@ -1453,9 +1463,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
out_ra:
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
if (!file)
kfree(ra);
@@ -1571,7 +1581,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = old_size + new_size;
}
- if (new_size < 256 * 1024 * 1024) {
+ if (new_size < SZ_256M) {
ret = -EINVAL;
goto out_free;
}
@@ -1644,7 +1654,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
src_inode = file_inode(src.file);
if (src_inode->i_sb != file_inode(file)->i_sb) {
- btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+ btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
@@ -1712,7 +1722,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- if (vol_args->size > PAGE_CACHE_SIZE) {
+ if (vol_args->size > PAGE_SIZE) {
ret = -EINVAL;
goto free_args;
}
@@ -2085,8 +2095,6 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- btrfs_err(info, "could not find root %llu",
- sk->tree_id);
btrfs_free_path(path);
return -ENOENT;
}
@@ -2160,7 +2168,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
struct inode *inode;
int ret;
size_t buf_size;
- const size_t buf_limit = 16 * 1024 * 1024;
+ const size_t buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2418,7 +2426,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Don't allow to delete a subvolume with send in progress. This is
@@ -2464,6 +2472,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
+ btrfs_record_snapshot_destroy(trans, dir);
+
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
dentry->d_name.name,
@@ -2531,7 +2541,7 @@ out_up_write:
spin_unlock(&dest->root_item_lock);
}
out_unlock_inode:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!err) {
d_invalidate(dentry);
btrfs_invalidate_inodes(dest);
@@ -2547,7 +2557,7 @@ out_unlock_inode:
out_dput:
dput(dentry);
out_unlock_dir:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
out_drop_write:
mnt_drop_write_file(file);
out:
@@ -2782,24 +2792,29 @@ out:
static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
{
struct page *page;
- struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
page = grab_cache_page(inode->i_mapping, index);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
if (!PageUptodate(page)) {
- if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
- 0))
- return NULL;
+ int ret;
+
+ ret = btrfs_readpage(NULL, page);
+ if (ret)
+ return ERR_PTR(ret);
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
- return NULL;
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EAGAIN);
}
}
- unlock_page(page);
return page;
}
@@ -2808,20 +2823,34 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
int num_pages, u64 off)
{
int i;
- pgoff_t index = off >> PAGE_CACHE_SHIFT;
+ pgoff_t index = off >> PAGE_SHIFT;
for (i = 0; i < num_pages; i++) {
+again:
pages[i] = extent_same_get_page(inode, index + i);
- if (!pages[i])
- return -ENOMEM;
+ if (IS_ERR(pages[i])) {
+ int err = PTR_ERR(pages[i]);
+
+ if (err == -EAGAIN)
+ goto again;
+ pages[i] = NULL;
+ return err;
+ }
}
return 0;
}
-static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+static int lock_extent_range(struct inode *inode, u64 off, u64 len,
+ bool retry_range_locking)
{
- /* do any pending delalloc/csum calc on src, one way or
- another, and lock file content */
+ /*
+ * Do any pending delalloc/csum calculations on inode, one way or
+ * another, and lock file content.
+ * The locking order is:
+ *
+ * 1) pages
+ * 2) range in the inode's io tree
+ */
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
@@ -2839,14 +2868,17 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
+ if (!retry_range_locking)
+ return -EAGAIN;
btrfs_wait_ordered_range(inode, off, len);
}
+ return 0;
}
static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
{
- mutex_unlock(&inode1->i_mutex);
- mutex_unlock(&inode2->i_mutex);
+ inode_unlock(inode1);
+ inode_unlock(inode2);
}
static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
@@ -2854,8 +2886,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
if (inode1 < inode2)
swap(inode1, inode2);
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(inode1, I_MUTEX_PARENT);
+ inode_lock_nested(inode2, I_MUTEX_CHILD);
}
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -2865,15 +2897,24 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
-static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
- struct inode *inode2, u64 loff2, u64 len)
+static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
+ struct inode *inode2, u64 loff2, u64 len,
+ bool retry_range_locking)
{
+ int ret;
+
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
}
- lock_extent_range(inode1, loff1, len);
- lock_extent_range(inode2, loff2, len);
+ ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
+ if (ret)
+ return ret;
+ ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
+ if (ret)
+ unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
+ loff1 + len - 1);
+ return ret;
}
struct cmp_pages {
@@ -2889,11 +2930,15 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
for (i = 0; i < cmp->num_pages; i++) {
pg = cmp->src_pages[i];
- if (pg)
- page_cache_release(pg);
+ if (pg) {
+ unlock_page(pg);
+ put_page(pg);
+ }
pg = cmp->dst_pages[i];
- if (pg)
- page_cache_release(pg);
+ if (pg) {
+ unlock_page(pg);
+ put_page(pg);
+ }
}
kfree(cmp->src_pages);
kfree(cmp->dst_pages);
@@ -2904,7 +2949,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
u64 len, struct cmp_pages *cmp)
{
int ret;
- int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+ int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
struct page **src_pgarr, **dst_pgarr;
/*
@@ -2913,8 +2958,8 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
* of the array is bounded by len, which is in turn bounded by
* BTRFS_MAX_DEDUPE_LEN.
*/
- src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
- dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS);
+ src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
+ dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
if (!src_pgarr || !dst_pgarr) {
kfree(src_pgarr);
kfree(dst_pgarr);
@@ -2942,18 +2987,20 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
int ret = 0;
int i;
struct page *src_page, *dst_page;
- unsigned int cmp_len = PAGE_CACHE_SIZE;
+ unsigned int cmp_len = PAGE_SIZE;
void *addr, *dst_addr;
i = 0;
while (len) {
- if (len < PAGE_CACHE_SIZE)
+ if (len < PAGE_SIZE)
cmp_len = len;
BUG_ON(i >= cmp->num_pages);
src_page = cmp->src_pages[i];
dst_page = cmp->dst_pages[i];
+ ASSERT(PageLocked(src_page));
+ ASSERT(PageLocked(dst_page));
addr = kmap_atomic(src_page);
dst_addr = kmap_atomic(dst_page);
@@ -2962,7 +3009,7 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
flush_dcache_page(dst_page);
if (memcmp(addr, dst_addr, cmp_len))
- ret = BTRFS_SAME_DATA_DIFFERS;
+ ret = -EBADE;
kunmap_atomic(addr);
kunmap_atomic(dst_addr);
@@ -3014,11 +3061,14 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
return 0;
if (same_inode) {
- mutex_lock(&src->i_mutex);
+ inode_lock(src);
ret = extent_same_check_offsets(src, loff, &len, olen);
if (ret)
goto out_unlock;
+ ret = extent_same_check_offsets(src, dst_loff, &len, olen);
+ if (ret)
+ goto out_unlock;
/*
* Single inode case wants the same checks, except we
@@ -3066,14 +3116,46 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
goto out_unlock;
}
+again:
ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
if (ret)
goto out_unlock;
if (same_inode)
- lock_extent_range(src, same_lock_start, same_lock_len);
+ ret = lock_extent_range(src, same_lock_start, same_lock_len,
+ false);
else
- btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+ ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
+ false);
+ /*
+ * If one of the inodes has dirty pages in the respective range or
+ * ordered extents, we need to flush delalloc and wait for all ordered
+ * extents in the range. We must unlock the pages and the ranges in the
+ * io trees to avoid deadlocks when flushing delalloc (requires locking
+ * pages) and when waiting for ordered extents to complete (they require
+ * range locking).
+ */
+ if (ret == -EAGAIN) {
+ /*
+ * Ranges in the io trees already unlocked. Now unlock all
+ * pages before waiting for all IO to complete.
+ */
+ btrfs_cmp_data_free(&cmp);
+ if (same_inode) {
+ btrfs_wait_ordered_range(src, same_lock_start,
+ same_lock_len);
+ } else {
+ btrfs_wait_ordered_range(src, loff, len);
+ btrfs_wait_ordered_range(dst, dst_loff, len);
+ }
+ goto again;
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ btrfs_cmp_data_free(&cmp);
+ return ret;
+ }
/* pass original length for comparison so we stay within i_size */
ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
@@ -3089,121 +3171,39 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
btrfs_cmp_data_free(&cmp);
out_unlock:
if (same_inode)
- mutex_unlock(&src->i_mutex);
+ inode_unlock(src);
else
btrfs_double_inode_unlock(src, dst);
return ret;
}
-#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+#define BTRFS_MAX_DEDUPE_LEN SZ_16M
-static long btrfs_ioctl_file_extent_same(struct file *file,
- struct btrfs_ioctl_same_args __user *argp)
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff)
{
- struct btrfs_ioctl_same_args *same = NULL;
- struct btrfs_ioctl_same_extent_info *info;
- struct inode *src = file_inode(file);
- u64 off;
- u64 len;
- int i;
- int ret;
- unsigned long size;
+ struct inode *src = file_inode(src_file);
+ struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- bool is_admin = capable(CAP_SYS_ADMIN);
- u16 count;
+ ssize_t res;
- if (!(file->f_mode & FMODE_READ))
- return -EINVAL;
+ if (olen > BTRFS_MAX_DEDUPE_LEN)
+ olen = BTRFS_MAX_DEDUPE_LEN;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- if (get_user(count, &argp->dest_count)) {
- ret = -EFAULT;
- goto out;
- }
-
- size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
-
- same = memdup_user(argp, size);
-
- if (IS_ERR(same)) {
- ret = PTR_ERR(same);
- same = NULL;
- goto out;
- }
-
- off = same->logical_offset;
- len = same->length;
-
- /*
- * Limit the total length we will dedupe for each operation.
- * This is intended to bound the total time spent in this
- * ioctl to something sane.
- */
- if (len > BTRFS_MAX_DEDUPE_LEN)
- len = BTRFS_MAX_DEDUPE_LEN;
-
- if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+ if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
/*
* Btrfs does not support blocksize < page_size. As a
* result, btrfs_cmp_data() won't correctly handle
* this situation without an update.
*/
- ret = -EINVAL;
- goto out;
- }
-
- ret = -EISDIR;
- if (S_ISDIR(src->i_mode))
- goto out;
-
- ret = -EACCES;
- if (!S_ISREG(src->i_mode))
- goto out;
-
- /* pre-format output fields to sane values */
- for (i = 0; i < count; i++) {
- same->info[i].bytes_deduped = 0ULL;
- same->info[i].status = 0;
- }
-
- for (i = 0, info = same->info; i < count; i++, info++) {
- struct inode *dst;
- struct fd dst_file = fdget(info->fd);
- if (!dst_file.file) {
- info->status = -EBADF;
- continue;
- }
- dst = file_inode(dst_file.file);
-
- if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
- info->status = -EINVAL;
- } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
- info->status = -EXDEV;
- } else if (S_ISDIR(dst->i_mode)) {
- info->status = -EISDIR;
- } else if (!S_ISREG(dst->i_mode)) {
- info->status = -EACCES;
- } else {
- info->status = btrfs_extent_same(src, off, len, dst,
- info->logical_offset);
- if (info->status == 0)
- info->bytes_deduped += len;
- }
- fdput(dst_file);
+ return -EINVAL;
}
- ret = copy_to_user(argp, same, size);
- if (ret)
- ret = -EFAULT;
-
-out:
- mnt_drop_write_file(file);
- kfree(same);
- return ret;
+ res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
+ if (res)
+ return res;
+ return olen;
}
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
@@ -3218,7 +3218,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
inode_inc_iversion(inode);
if (!no_time_update)
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
@@ -3478,7 +3478,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
return ret;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -3779,17 +3779,16 @@ out:
return ret;
}
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = file_inode(file);
+ struct inode *src = file_inode(file_src);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct fd src_file;
- struct inode *src;
int ret;
u64 len = olen;
u64 bs = root->fs_info->sb->s_blocksize;
- int same_inode = 0;
+ int same_inode = src == inode;
/*
* TODO:
@@ -3802,54 +3801,25 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* be either compressed or non-compressed.
*/
- /* the destination must be opened for writing */
- if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
- return -EINVAL;
-
if (btrfs_root_readonly(root))
return -EROFS;
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- src_file = fdget(srcfd);
- if (!src_file.file) {
- ret = -EBADF;
- goto out_drop_write;
- }
-
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != file->f_path.mnt)
- goto out_fput;
-
- src = file_inode(src_file.file);
-
- ret = -EINVAL;
- if (src == inode)
- same_inode = 1;
-
- /* the src must be open for reading */
- if (!(src_file.file->f_mode & FMODE_READ))
- goto out_fput;
+ if (file_src->f_path.mnt != file->f_path.mnt ||
+ src->i_sb != inode->i_sb)
+ return -EXDEV;
/* don't make the dst file partly checksummed */
if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- goto out_fput;
+ return -EINVAL;
- ret = -EISDIR;
if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
- goto out_fput;
-
- ret = -EXDEV;
- if (src->i_sb != inode->i_sb)
- goto out_fput;
+ return -EISDIR;
if (!same_inode) {
btrfs_double_inode_lock(src, inode);
} else {
- mutex_lock(&src->i_mutex);
+ inode_lock(src);
}
/* determine range to clone */
@@ -3895,9 +3865,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 lock_start = min_t(u64, off, destoff);
u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
- lock_extent_range(src, lock_start, lock_len);
+ ret = lock_extent_range(src, lock_start, lock_len, true);
} else {
- btrfs_double_extent_lock(src, off, inode, destoff, len);
+ ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
+ true);
+ }
+ ASSERT(ret == 0);
+ if (WARN_ON(ret)) {
+ /* ranges in the io trees already unlocked */
+ goto out_unlock;
}
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
@@ -3914,28 +3890,33 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
*/
- truncate_inode_pages_range(&inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len) - 1);
+ truncate_inode_pages_range(&inode->i_data,
+ round_down(destoff, PAGE_SIZE),
+ round_up(destoff + len, PAGE_SIZE) - 1);
out_unlock:
if (!same_inode)
btrfs_double_inode_unlock(src, inode);
else
- mutex_unlock(&src->i_mutex);
-out_fput:
- fdput(src_file);
-out_drop_write:
- mnt_drop_write_file(file);
+ inode_unlock(src);
return ret;
}
-static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
{
- struct btrfs_ioctl_clone_range_args args;
+ ssize_t ret;
- if (copy_from_user(&args, argp, sizeof(args)))
- return -EFAULT;
- return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
- args.src_length, args.dest_offset);
+ ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
+ if (ret == 0)
+ ret = len;
+ return ret;
+}
+
+int btrfs_clone_file_range(struct file *src_file, loff_t off,
+ struct file *dst_file, loff_t destoff, u64 len)
+{
+ return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
/*
@@ -4143,11 +4124,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
/* we generally have at most 6 or so space infos, one for each raid
* level. So, a whole page should be more than enough for everyone
*/
- if (alloc_size > PAGE_CACHE_SIZE)
+ if (alloc_size > PAGE_SIZE)
return -ENOMEM;
space_args.total_spaces = 0;
- dest = kmalloc(alloc_size, GFP_NOFS);
+ dest = kmalloc(alloc_size, GFP_KERNEL);
if (!dest)
return -ENOMEM;
dest_orig = dest;
@@ -4524,7 +4505,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
goto out;
}
- size = min_t(u32, loi->size, 64 * 1024);
+ size = min_t(u32, loi->size, SZ_64K);
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
@@ -4673,7 +4654,7 @@ locked:
goto out_bargs;
}
- bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
goto out_bargs;
@@ -4759,7 +4740,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
goto out;
}
- bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
if (!bargs) {
ret = -ENOMEM;
goto out;
@@ -5019,7 +5000,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+ qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
if (!qsa)
return -ENOMEM;
@@ -5052,7 +5033,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
struct btrfs_trans_handle *trans;
- struct timespec ct = CURRENT_TIME;
+ struct timespec ct = current_fs_time(inode->i_sb);
int ret = 0;
int received_uuid_changed;
@@ -5149,7 +5130,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
goto out;
}
- args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
if (!args64) {
ret = -ENOMEM;
goto out;
@@ -5283,10 +5264,9 @@ out_unlock:
.compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
.incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
-static int btrfs_ioctl_get_supported_features(struct file *file,
- void __user *arg)
+int btrfs_ioctl_get_supported_features(void __user *arg)
{
- static struct btrfs_ioctl_feature_flags features[3] = {
+ static const struct btrfs_ioctl_feature_flags features[3] = {
INIT_FEATURE_FLAGS(SUPP),
INIT_FEATURE_FLAGS(SAFE_SET),
INIT_FEATURE_FLAGS(SAFE_CLEAR)
@@ -5485,10 +5465,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
- case BTRFS_IOC_CLONE:
- return btrfs_ioctl_clone(file, arg, 0, 0, 0);
- case BTRFS_IOC_CLONE_RANGE:
- return btrfs_ioctl_clone_range(file, argp);
case BTRFS_IOC_TRANS_START:
return btrfs_ioctl_trans_start(file);
case BTRFS_IOC_TRANS_END:
@@ -5566,10 +5542,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
- case BTRFS_IOC_FILE_EXTENT_SAME:
- return btrfs_ioctl_file_extent_same(file, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
- return btrfs_ioctl_get_supported_features(file, argp);
+ return btrfs_ioctl_get_supported_features(argp);
case BTRFS_IOC_GET_FEATURES:
return btrfs_ioctl_get_features(file, argp);
case BTRFS_IOC_SET_FEATURES:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 8077461fc56a..d13128c70ddd 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
atomic_dec(&eb->spinning_readers);
read_unlock(&eb->lock);
}
- return;
}
/*
@@ -96,7 +95,6 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
}
- return;
}
/*
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a2f051347731..1adfbe7be6b8 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -55,8 +55,8 @@ static struct list_head *lzo_alloc_workspace(void)
return ERR_PTR(-ENOMEM);
workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
- workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
- workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+ workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
+ workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -116,7 +116,7 @@ static int lzo_compress_pages(struct list_head *ws,
*total_out = 0;
*total_in = 0;
- in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
data_in = kmap(in_page);
/*
@@ -133,10 +133,10 @@ static int lzo_compress_pages(struct list_head *ws,
tot_out = LZO_LEN;
pages[0] = out_page;
nr_pages = 1;
- pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+ pg_bytes_left = PAGE_SIZE - LZO_LEN;
/* compress at most one page of data each time */
- in_len = min(len, PAGE_CACHE_SIZE);
+ in_len = min(len, PAGE_SIZE);
while (tot_in < len) {
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
&out_len, workspace->mem);
@@ -201,7 +201,7 @@ static int lzo_compress_pages(struct list_head *ws,
cpage_out = kmap(out_page);
pages[nr_pages++] = out_page;
- pg_bytes_left = PAGE_CACHE_SIZE;
+ pg_bytes_left = PAGE_SIZE;
out_offset = 0;
}
}
@@ -221,12 +221,12 @@ static int lzo_compress_pages(struct list_head *ws,
bytes_left = len - tot_in;
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
- start += PAGE_CACHE_SIZE;
- in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ start += PAGE_SIZE;
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
data_in = kmap(in_page);
- in_len = min(bytes_left, PAGE_CACHE_SIZE);
+ in_len = min(bytes_left, PAGE_SIZE);
}
if (tot_out > tot_in)
@@ -248,7 +248,7 @@ out:
if (in_page) {
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
}
return ret;
@@ -266,7 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
char *data_in;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long buf_offset = 0;
unsigned long bytes;
@@ -289,7 +289,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
tot_in = LZO_LEN;
in_offset = LZO_LEN;
tot_len = min_t(size_t, srclen, tot_len);
- in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+ in_page_bytes_left = PAGE_SIZE - LZO_LEN;
tot_out = 0;
pg_offset = 0;
@@ -345,12 +345,12 @@ cont:
data_in = kmap(pages_in[++page_in_index]);
- in_page_bytes_left = PAGE_CACHE_SIZE;
+ in_page_bytes_left = PAGE_SIZE;
in_offset = 0;
}
}
- out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+ out_len = lzo1x_worst_compress(PAGE_SIZE);
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
&out_len);
if (need_unmap)
@@ -399,7 +399,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
in_len = read_compress_length(data_in);
data_in += LZO_LEN;
- out_len = PAGE_CACHE_SIZE;
+ out_len = PAGE_SIZE;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "BTRFS: decompress failed!\n");
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8c27292ea9ea..0de7da5a610d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,7 @@
#include "btrfs_inode.h"
#include "extent_io.h"
#include "disk-io.h"
+#include "compression.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@@ -1009,7 +1010,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
for (; node; node = rb_prev(node)) {
test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- /* We treat this entry as if it doesnt exist */
+ /* We treat this entry as if it doesn't exist */
if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
continue;
if (test->file_offset + test->len <= disk_i_size)
@@ -1114,6 +1115,5 @@ int __init ordered_data_init(void)
void ordered_data_exit(void)
{
- if (btrfs_ordered_extent_cache)
- kmem_cache_destroy(btrfs_ordered_extent_cache);
+ kmem_cache_destroy(btrfs_ordered_extent_cache);
}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 647ab12fdf5d..147dc6ca5de1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -295,8 +295,27 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_dev_extent_chunk_offset(l, dev_extent),
btrfs_dev_extent_length(l, dev_extent));
break;
- case BTRFS_DEV_STATS_KEY:
- printk(KERN_INFO "\t\tdevice stats\n");
+ case BTRFS_PERSISTENT_ITEM_KEY:
+ printk(KERN_INFO "\t\tpersistent item objectid %llu offset %llu\n",
+ key.objectid, key.offset);
+ switch (key.objectid) {
+ case BTRFS_DEV_STATS_OBJECTID:
+ printk(KERN_INFO "\t\tdevice stats\n");
+ break;
+ default:
+ printk(KERN_INFO "\t\tunknown persistent item\n");
+ }
+ break;
+ case BTRFS_TEMPORARY_ITEM_KEY:
+ printk(KERN_INFO "\t\ttemporary item objectid %llu offset %llu\n",
+ key.objectid, key.offset);
+ switch (key.objectid) {
+ case BTRFS_BALANCE_OBJECTID:
+ printk(KERN_INFO "\t\tbalance status\n");
+ break;
+ default:
+ printk(KERN_INFO "\t\tunknown temporary item\n");
+ }
break;
case BTRFS_DEV_REPLACE_KEY:
printk(KERN_INFO "\t\tdev replace\n");
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f9e60231f685..36992128c746 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -22,6 +22,7 @@
#include "hash.h"
#include "transaction.h"
#include "xattr.h"
+#include "compression.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 46476c226395..9e119552ed32 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -993,9 +993,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
- spin_lock(&fs_info->qgroup_lock);
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
+ btrfs_qgroup_wait_for_completion(fs_info);
+ spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
@@ -1461,6 +1462,9 @@ struct btrfs_qgroup_extent_record
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
+ assert_spin_locked(&delayed_refs->lock);
+ trace_btrfs_qgroup_insert_dirty_extent(record);
+
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
@@ -1591,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+ trace_qgroup_update_counters(qg->qgroupid, cur_old_count,
+ cur_new_count);
+
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
@@ -1680,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
goto out_free;
BUG_ON(!fs_info->quota_root);
+ trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots,
+ nr_new_roots);
+
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
ret = -ENOMEM;
@@ -1749,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
+ trace_btrfs_qgroup_account_extents(record);
+
if (!ret) {
/*
* Use (u64)-1 as time_seq to do special search, which
@@ -1839,8 +1851,10 @@ out:
}
/*
- * copy the acounting information between qgroups. This is necessary when a
- * snapshot or a subvolume is created
+ * Copy the accounting information between qgroups. This is necessary
+ * when a snapshot or a subvolume is created. Throwing an error will
+ * cause a transaction abort so we take extra care here to only error
+ * when a readonly fs is a reasonable outcome.
*/
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
@@ -1870,15 +1884,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2 * inherit->num_excl_copies;
for (i = 0; i < nums; ++i) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
- if (!srcgroup) {
- ret = -EINVAL;
- goto out;
- }
- if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
- ret = -EINVAL;
- goto out;
- }
+ /*
+ * Zero out invalid groups so we can ignore
+ * them later.
+ */
+ if (!srcgroup ||
+ ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
+ *i_qgroups = 0ULL;
+
++i_qgroups;
}
}
@@ -1913,17 +1927,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
*/
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
- for (i = 0; i < inherit->num_qgroups; ++i) {
+ for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+ if (*i_qgroups == 0)
+ continue;
ret = add_qgroup_relation_item(trans, quota_root,
objectid, *i_qgroups);
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
ret = add_qgroup_relation_item(trans, quota_root,
*i_qgroups, objectid);
- if (ret)
+ if (ret && ret != -EEXIST)
goto out;
- ++i_qgroups;
}
+ ret = 0;
}
@@ -1984,17 +2000,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
- ret = add_relation_rb(quota_root->fs_info, objectid,
- *i_qgroups);
- if (ret)
- goto unlock;
+ if (*i_qgroups) {
+ ret = add_relation_rb(quota_root->fs_info, objectid,
+ *i_qgroups);
+ if (ret)
+ goto unlock;
+ }
++i_qgroups;
}
- for (i = 0; i < inherit->num_ref_copies; ++i) {
+ for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
@@ -2005,12 +2026,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
- i_qgroups += 2;
}
- for (i = 0; i < inherit->num_excl_copies; ++i) {
+ for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
struct btrfs_qgroup *dst;
+ if (!i_qgroups[0] || !i_qgroups[1])
+ continue;
+
src = find_qgroup_rb(fs_info, i_qgroups[0]);
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
@@ -2021,7 +2044,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
- i_qgroups += 2;
}
unlock:
@@ -2198,7 +2220,6 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
int slot;
int ret;
- path->leave_spinning = 1;
mutex_lock(&fs_info->qgroup_rescan_lock);
ret = btrfs_search_slot_for_read(fs_info->extent_root,
&fs_info->qgroup_rescan_progress,
@@ -2286,7 +2307,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
goto out;
err = 0;
- while (!err) {
+ while (!err && !btrfs_fs_closing(fs_info)) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -2307,7 +2328,8 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
@@ -2336,7 +2358,9 @@ out:
}
btrfs_end_transaction(trans, fs_info->quota_root);
- if (err >= 0) {
+ if (btrfs_fs_closing(fs_info)) {
+ btrfs_info(fs_info, "qgroup scan paused");
+ } else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
} else {
@@ -2384,12 +2408,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- init_completion(&fs_info->qgroup_rescan_completion);
-
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1a33d3eb36de..0b7792e02dd5 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -270,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
s = kmap(rbio->bio_pages[i]);
d = kmap(rbio->stripe_pages[i]);
- memcpy(d, s, PAGE_CACHE_SIZE);
+ memcpy(d, s, PAGE_SIZE);
kunmap(rbio->bio_pages[i]);
kunmap(rbio->stripe_pages[i]);
@@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
}
spin_unlock_irqrestore(&table->cache_lock, flags);
- return;
}
/*
@@ -610,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
return 1;
}
+static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
+ int index)
+{
+ return stripe * rbio->stripe_npages + index;
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
+ int index)
+{
+ return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+}
+
/*
* helper to index into the pstripe
*/
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
- index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
- return rbio->stripe_pages[index];
+ return rbio_stripe_page(rbio, rbio->nr_data, index);
}
/*
@@ -627,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
if (rbio->nr_data + 1 == rbio->real_stripes)
return NULL;
-
- index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
- PAGE_CACHE_SHIFT;
- return rbio->stripe_pages[index];
+ return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}
/*
@@ -890,6 +901,7 @@ static void raid_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
int err = bio->bi_error;
+ int max_errors;
if (err)
fail_bio_stripe(rbio, bio);
@@ -902,11 +914,12 @@ static void raid_write_end_io(struct bio *bio)
err = 0;
/* OK, we have read all the stripes we need to. */
- if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
+ 0 : rbio->bbio->max_errors;
+ if (atomic_read(&rbio->error) > max_errors)
err = -EIO;
rbio_orig_end_io(rbio, err);
- return;
}
/*
@@ -949,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
*/
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
- unsigned long nr = stripe_len * nr_stripes;
- return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
+ return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}
/*
@@ -968,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
void *p;
rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
- DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
- GFP_NOFS);
+ DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
+ sizeof(long), GFP_NOFS);
if (!rbio)
return ERR_PTR(-ENOMEM);
@@ -1023,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
if (!page)
return -ENOMEM;
rbio->stripe_pages[i] = page;
- ClearPageUptodate(page);
}
return 0;
}
-/* allocate pages for just the p/q stripes */
+/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
int i;
struct page *page;
- i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+ i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
for (; i < rbio->nr_pages; i++) {
if (rbio->stripe_pages[i])
@@ -1067,7 +1078,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
u64 disk_start;
stripe = &rbio->bbio->stripes[stripe_nr];
- disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+ disk_start = stripe->physical + (page_index << PAGE_SHIFT);
/* if the device is missing, just fail this stripe */
if (!stripe->dev->bdev)
@@ -1085,8 +1096,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
if (last_end == disk_start && stripe->dev->bdev &&
!last->bi_error &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
- if (ret == PAGE_CACHE_SIZE)
+ ret = bio_add_page(last, page, PAGE_SIZE, 0);
+ if (ret == PAGE_SIZE)
return 0;
}
}
@@ -1100,7 +1111,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
bio->bi_bdev = stripe->dev->bdev;
bio->bi_iter.bi_sector = disk_start >> 9;
- bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+ bio_add_page(bio, page, PAGE_SIZE, 0);
bio_list_add(bio_list, bio);
return 0;
}
@@ -1123,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
}
/*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
-{
- int index;
- index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
- index += page;
- return rbio->stripe_pages[index];
-}
-
-/*
* helper function to walk our bio list and populate the bio_pages array with
* the result. This seems expensive, but it is faster than constantly
* searching through the bio list as we setup the IO in finish_rmw or stripe
@@ -1155,7 +1154,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
bio_list_for_each(bio, &rbio->bio_list) {
start = (u64)bio->bi_iter.bi_sector << 9;
stripe_offset = start - rbio->bbio->raid_map[0];
- page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+ page_index = stripe_offset >> PAGE_SHIFT;
for (i = 0; i < bio->bi_vcnt; i++) {
p = bio->bi_io_vec[i].bv_page;
@@ -1177,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
struct btrfs_bio *bbio = rbio->bbio;
void *pointers[rbio->real_stripes];
- int stripe_len = rbio->stripe_len;
int nr_data = rbio->nr_data;
int stripe;
int pagenr;
@@ -1185,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
int q_stripe = -1;
struct bio_list bio_list;
struct bio *bio;
- int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
int ret;
bio_list_init(&bio_list);
@@ -1228,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *p;
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
@@ -1256,7 +1253,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
} else {
/* raid5 */
memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
- run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
@@ -1270,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
* everything else.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1294,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
if (!bbio->tgtdev_map[stripe])
continue;
- for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1508,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
@@ -1527,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
* stripe
*/
for (stripe = 0; stripe < rbio->nr_data; stripe++) {
- for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
/*
* we want to find all the pages missing from
@@ -1803,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
int pagenr, stripe;
void **pointers;
int faila = -1, failb = -1;
- int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
struct page *page;
int err;
int i;
@@ -1826,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
/*
* Now we just use bitmap to mark the horizontal stripes in
* which we have data when doing parity scrub.
@@ -1919,7 +1914,7 @@ pstripe:
/* Copy parity block into failed block to start with */
memcpy(pointers[faila],
pointers[rbio->nr_data],
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
/* rearrange the pointer array */
p = pointers[faila];
@@ -1928,7 +1923,7 @@ pstripe:
pointers[rbio->nr_data - 1] = p;
/* xor in the rest */
- run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+ run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
}
/* if we're doing this rebuild as part of an rmw, go through
* and set all of our private rbio pages in the
@@ -1937,7 +1932,7 @@ pstripe:
* other endio functions will fiddle the uptodate bits
*/
if (rbio->operation == BTRFS_RBIO_WRITE) {
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < rbio->stripe_npages; i++) {
if (faila != -1) {
page = rbio_stripe_page(rbio, faila, i);
SetPageUptodate(page);
@@ -2033,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
int bios_to_read = 0;
struct bio_list bio_list;
int ret;
- int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
int pagenr;
int stripe;
struct bio *bio;
@@ -2057,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
continue;
}
- for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+ for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *p;
/*
@@ -2256,7 +2250,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
- index = stripe_offset >> PAGE_CACHE_SHIFT;
+ index = stripe_offset >> PAGE_SHIFT;
rbio->bio_pages[index] = page;
}
@@ -2281,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
if (!page)
return -ENOMEM;
rbio->stripe_pages[index] = page;
- ClearPageUptodate(page);
}
}
return 0;
}
-/*
- * end io function used by finish_rmw. When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_parity_end_io(struct bio *bio)
-{
- struct btrfs_raid_bio *rbio = bio->bi_private;
- int err = bio->bi_error;
-
- if (bio->bi_error)
- fail_bio_stripe(rbio, bio);
-
- bio_put(bio);
-
- if (!atomic_dec_and_test(&rbio->stripes_pending))
- return;
-
- err = 0;
-
- if (atomic_read(&rbio->error))
- err = -EIO;
-
- rbio_orig_end_io(rbio, err);
-}
-
static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
int need_check)
{
@@ -2397,14 +2365,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
} else {
/* raid5 */
memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
- run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+ run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
/* Check scrubbing pairty and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
parity = kmap(p);
- if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
- memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+ if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
+ memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE);
else
/* Parity is right, needn't writeback */
bitmap_clear(rbio->dbitmap, pagenr, 1);
@@ -2464,7 +2432,7 @@ submit_write:
break;
bio->bi_private = rbio;
- bio->bi_end_io = raid_write_parity_end_io;
+ bio->bi_end_io = raid_write_end_io;
submit_bio(WRITE, bio);
}
return;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 619f92963e27..298631eaee78 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -72,7 +72,7 @@ struct reada_extent {
spinlock_t lock;
struct reada_zone *zones[BTRFS_MAX_MIRRORS];
int nzones;
- struct btrfs_device *scheduled_for;
+ int scheduled;
};
struct reada_zone {
@@ -101,67 +101,53 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info);
static void __reada_start_machine(struct btrfs_fs_info *fs_info);
static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, int level, u64 generation);
+ struct btrfs_key *top, u64 generation);
/* recurses */
/* in case of err, eb might be NULL */
-static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
- u64 start, int err)
+static void __readahead_hook(struct btrfs_fs_info *fs_info,
+ struct reada_extent *re, struct extent_buffer *eb,
+ u64 start, int err)
{
int level = 0;
int nritems;
int i;
u64 bytenr;
u64 generation;
- struct reada_extent *re;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head list;
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- struct btrfs_device *for_dev;
if (eb)
level = btrfs_header_level(eb);
- /* find extent */
- spin_lock(&fs_info->reada_lock);
- re = radix_tree_lookup(&fs_info->reada_tree, index);
- if (re)
- re->refcnt++;
- spin_unlock(&fs_info->reada_lock);
-
- if (!re)
- return -1;
-
spin_lock(&re->lock);
/*
* just take the full list from the extent. afterwards we
* don't need the lock anymore
*/
list_replace_init(&re->extctl, &list);
- for_dev = re->scheduled_for;
- re->scheduled_for = NULL;
+ re->scheduled = 0;
spin_unlock(&re->lock);
- if (err == 0) {
- nritems = level ? btrfs_header_nritems(eb) : 0;
- generation = btrfs_header_generation(eb);
- /*
- * FIXME: currently we just set nritems to 0 if this is a leaf,
- * effectively ignoring the content. In a next step we could
- * trigger more readahead depending from the content, e.g.
- * fetch the checksums for the extents in the leaf.
- */
- } else {
- /*
- * this is the error case, the extent buffer has not been
- * read correctly. We won't access anything from it and
- * just cleanup our data structures. Effectively this will
- * cut the branch below this node from read ahead.
- */
- nritems = 0;
- generation = 0;
- }
+ /*
+ * this is the error case, the extent buffer has not been
+ * read correctly. We won't access anything from it and
+ * just cleanup our data structures. Effectively this will
+ * cut the branch below this node from read ahead.
+ */
+ if (err)
+ goto cleanup;
+ /*
+ * FIXME: currently we just set nritems to 0 if this is a leaf,
+ * effectively ignoring the content. In a next step we could
+ * trigger more readahead depending from the content, e.g.
+ * fetch the checksums for the extents in the leaf.
+ */
+ if (!level)
+ goto cleanup;
+
+ nritems = btrfs_header_nritems(eb);
+ generation = btrfs_header_generation(eb);
for (i = 0; i < nritems; i++) {
struct reada_extctl *rec;
u64 n_gen;
@@ -188,19 +174,20 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
*/
#ifdef DEBUG
if (rec->generation != generation) {
- btrfs_debug(root->fs_info,
- "generation mismatch for (%llu,%d,%llu) %llu != %llu",
- key.objectid, key.type, key.offset,
- rec->generation, generation);
+ btrfs_debug(fs_info,
+ "generation mismatch for (%llu,%d,%llu) %llu != %llu",
+ key.objectid, key.type, key.offset,
+ rec->generation, generation);
}
#endif
if (rec->generation == generation &&
btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
- reada_add_block(rc, bytenr, &next_key,
- level - 1, n_gen);
+ reada_add_block(rc, bytenr, &next_key, n_gen);
}
}
+
+cleanup:
/*
* free extctl records
*/
@@ -222,26 +209,37 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
reada_extent_put(fs_info, re); /* one ref for each entry */
}
- reada_extent_put(fs_info, re); /* our ref */
- if (for_dev)
- atomic_dec(&for_dev->reada_in_flight);
- return 0;
+ return;
}
/*
* start is passed separately in case eb in NULL, which may be the case with
* failed I/O
*/
-int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
- u64 start, int err)
+int btree_readahead_hook(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, u64 start, int err)
{
- int ret;
+ int ret = 0;
+ struct reada_extent *re;
- ret = __readahead_hook(root, eb, start, err);
+ /* find extent */
+ spin_lock(&fs_info->reada_lock);
+ re = radix_tree_lookup(&fs_info->reada_tree,
+ start >> PAGE_SHIFT);
+ if (re)
+ re->refcnt++;
+ spin_unlock(&fs_info->reada_lock);
+ if (!re) {
+ ret = -1;
+ goto start_machine;
+ }
- reada_start_machine(root->fs_info);
+ __readahead_hook(fs_info, re, eb, start, err);
+ reada_extent_put(fs_info, re); /* our ref */
+start_machine:
+ reada_start_machine(fs_info);
return ret;
}
@@ -259,19 +257,15 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
zone = NULL;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_CACHE_SHIFT, 1);
- if (ret == 1)
+ logical >> PAGE_SHIFT, 1);
+ if (ret == 1 && logical >= zone->start && logical <= zone->end) {
kref_get(&zone->refcnt);
- spin_unlock(&fs_info->reada_lock);
-
- if (ret == 1) {
- if (logical >= zone->start && logical < zone->end)
- return zone;
- spin_lock(&fs_info->reada_lock);
- kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
+ return zone;
}
+ spin_unlock(&fs_info->reada_lock);
+
cache = btrfs_lookup_block_group(fs_info, logical);
if (!cache)
return NULL;
@@ -280,7 +274,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
end = start + cache->key.offset - 1;
btrfs_put_block_group(cache);
- zone = kzalloc(sizeof(*zone), GFP_NOFS);
+ zone = kzalloc(sizeof(*zone), GFP_KERNEL);
if (!zone)
return NULL;
@@ -300,15 +294,17 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
- (unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
+ (unsigned long)(zone->end >> PAGE_SHIFT),
zone);
if (ret == -EEXIST) {
kfree(zone);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
- logical >> PAGE_CACHE_SHIFT, 1);
- if (ret == 1)
+ logical >> PAGE_SHIFT, 1);
+ if (ret == 1 && logical >= zone->start && logical <= zone->end)
kref_get(&zone->refcnt);
+ else
+ zone = NULL;
}
spin_unlock(&fs_info->reada_lock);
@@ -317,7 +313,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
static struct reada_extent *reada_find_extent(struct btrfs_root *root,
u64 logical,
- struct btrfs_key *top, int level)
+ struct btrfs_key *top)
{
int ret;
struct reada_extent *re = NULL;
@@ -330,9 +326,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
u64 length;
int real_stripes;
int nzones = 0;
- int i;
- unsigned long index = logical >> PAGE_CACHE_SHIFT;
+ unsigned long index = logical >> PAGE_SHIFT;
int dev_replace_is_ongoing;
+ int have_zone = 0;
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -343,7 +339,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
if (re)
return re;
- re = kzalloc(sizeof(*re), GFP_NOFS);
+ re = kzalloc(sizeof(*re), GFP_KERNEL);
if (!re)
return NULL;
@@ -375,11 +371,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
struct reada_zone *zone;
dev = bbio->stripes[nzones].dev;
+
+ /* cannot read ahead on missing device. */
+ if (!dev->bdev)
+ continue;
+
zone = reada_find_zone(fs_info, dev, logical, bbio);
if (!zone)
- break;
+ continue;
- re->zones[nzones] = zone;
+ re->zones[re->nzones++] = zone;
spin_lock(&zone->lock);
if (!zone->elems)
kref_get(&zone->refcnt);
@@ -389,14 +390,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- re->nzones = nzones;
- if (nzones == 0) {
+ if (re->nzones == 0) {
/* not a single zone found, error and out */
goto error;
}
/* insert extent in reada_tree + all per-device trees, all or nothing */
- btrfs_dev_replace_lock(&fs_info->dev_replace);
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
@@ -404,19 +404,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
BUG_ON(!re_exist);
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
goto error;
}
prev_dev = NULL;
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
&fs_info->dev_replace);
- for (i = 0; i < nzones; ++i) {
- dev = bbio->stripes[i].dev;
+ for (nzones = 0; nzones < re->nzones; ++nzones) {
+ dev = re->zones[nzones]->device;
+
if (dev == prev_dev) {
/*
* in case of DUP, just add the first zone. As both
@@ -427,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
*/
continue;
}
- if (!dev->bdev) {
- /*
- * cannot read ahead on missing device, but for RAID5/6,
- * REQ_GET_READ_MIRRORS return 1. So don't skip missing
- * device for such case.
- */
- if (nzones > 1)
- continue;
- }
+ if (!dev->bdev)
+ continue;
+
if (dev_replace_is_ongoing &&
dev == fs_info->dev_replace.tgtdev) {
/*
@@ -447,8 +442,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
prev_dev = dev;
ret = radix_tree_insert(&dev->reada_extents, index, re);
if (ret) {
- while (--i >= 0) {
- dev = bbio->stripes[i].dev;
+ while (--nzones >= 0) {
+ dev = re->zones[nzones]->device;
BUG_ON(dev == NULL);
/* ignore whether the entry was inserted */
radix_tree_delete(&dev->reada_extents, index);
@@ -456,21 +451,24 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
BUG_ON(fs_info == NULL);
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
goto error;
}
+ have_zone = 1;
}
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+
+ if (!have_zone)
+ goto error;
btrfs_put_bbio(bbio);
return re;
error:
- while (nzones) {
+ for (nzones = 0; nzones < re->nzones; ++nzones) {
struct reada_zone *zone;
- --nzones;
zone = re->zones[nzones];
kref_get(&zone->refcnt);
spin_lock(&zone->lock);
@@ -497,7 +495,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
struct reada_extent *re)
{
int i;
- unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+ unsigned long index = re->logical >> PAGE_SHIFT;
spin_lock(&fs_info->reada_lock);
if (--re->refcnt) {
@@ -531,8 +529,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- if (re->scheduled_for)
- atomic_dec(&re->scheduled_for->reada_in_flight);
kfree(re);
}
@@ -542,7 +538,7 @@ static void reada_zone_release(struct kref *kref)
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
radix_tree_delete(&zone->device->reada_zones,
- zone->end >> PAGE_CACHE_SHIFT);
+ zone->end >> PAGE_SHIFT);
kfree(zone);
}
@@ -556,17 +552,17 @@ static void reada_control_release(struct kref *kref)
}
static int reada_add_block(struct reada_control *rc, u64 logical,
- struct btrfs_key *top, int level, u64 generation)
+ struct btrfs_key *top, u64 generation)
{
struct btrfs_root *root = rc->root;
struct reada_extent *re;
struct reada_extctl *rec;
- re = reada_find_extent(root, logical, top, level); /* takes one ref */
+ re = reada_find_extent(root, logical, top); /* takes one ref */
if (!re)
return -1;
- rec = kzalloc(sizeof(*rec), GFP_NOFS);
+ rec = kzalloc(sizeof(*rec), GFP_KERNEL);
if (!rec) {
reada_extent_put(root->fs_info, re);
return -ENOMEM;
@@ -591,7 +587,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
int i;
- unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+ unsigned long index = zone->end >> PAGE_SHIFT;
for (i = 0; i < zone->ndevs; ++i) {
struct reada_zone *peer;
@@ -626,7 +622,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
(void **)&zone, index, 1);
if (ret == 0)
break;
- index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ index = (zone->end >> PAGE_SHIFT) + 1;
if (zone->locked) {
if (zone->elems > top_locked_elems) {
top_locked_elems = zone->elems;
@@ -662,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
u64 logical;
int ret;
int i;
- int need_kick = 0;
spin_lock(&fs_info->reada_lock);
if (dev->reada_curr_zone == NULL) {
@@ -678,8 +673,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
* plugging to speed things up
*/
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_CACHE_SHIFT, 1);
- if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
+ dev->reada_next >> PAGE_SHIFT, 1);
+ if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
ret = reada_pick_zone(dev);
if (!ret) {
spin_unlock(&fs_info->reada_lock);
@@ -687,7 +682,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
}
re = NULL;
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
- dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+ dev->reada_next >> PAGE_SHIFT, 1);
}
if (ret == 0) {
spin_unlock(&fs_info->reada_lock);
@@ -698,6 +693,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->reada_lock);
+ spin_lock(&re->lock);
+ if (re->scheduled || list_empty(&re->extctl)) {
+ spin_unlock(&re->lock);
+ reada_extent_put(fs_info, re);
+ return 0;
+ }
+ re->scheduled = 1;
+ spin_unlock(&re->lock);
+
/*
* find mirror num
*/
@@ -709,29 +713,20 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
}
logical = re->logical;
- spin_lock(&re->lock);
- if (re->scheduled_for == NULL) {
- re->scheduled_for = dev;
- need_kick = 1;
- }
- spin_unlock(&re->lock);
-
- reada_extent_put(fs_info, re);
-
- if (!need_kick)
- return 0;
-
atomic_inc(&dev->reada_in_flight);
ret = reada_tree_block_flagged(fs_info->extent_root, logical,
mirror_num, &eb);
if (ret)
- __readahead_hook(fs_info->extent_root, NULL, logical, ret);
+ __readahead_hook(fs_info, re, NULL, logical, ret);
else if (eb)
- __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
+ __readahead_hook(fs_info, re, eb, eb->start, ret);
if (eb)
free_extent_buffer(eb);
+ atomic_dec(&dev->reada_in_flight);
+ reada_extent_put(fs_info, re);
+
return 1;
}
@@ -752,6 +747,8 @@ static void reada_start_machine_worker(struct btrfs_work *work)
set_task_ioprio(current, BTRFS_IOPRIO_READA);
__reada_start_machine(fs_info);
set_task_ioprio(current, old_ioprio);
+
+ atomic_dec(&fs_info->reada_works_cnt);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -783,15 +780,19 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
* enqueue to workers to finish it. This will distribute the load to
* the cores.
*/
- for (i = 0; i < 2; ++i)
+ for (i = 0; i < 2; ++i) {
reada_start_machine(fs_info);
+ if (atomic_read(&fs_info->reada_works_cnt) >
+ BTRFS_MAX_MIRRORS * 2)
+ break;
+ }
}
static void reada_start_machine(struct btrfs_fs_info *fs_info)
{
struct reada_machine_work *rmw;
- rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
+ rmw = kzalloc(sizeof(*rmw), GFP_KERNEL);
if (!rmw) {
/* FIXME we cannot handle this properly right now */
BUG();
@@ -801,6 +802,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
rmw->fs_info = fs_info;
btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
+ atomic_inc(&fs_info->reada_works_cnt);
}
#ifdef DEBUG
@@ -836,7 +838,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
printk(KERN_CONT " curr off %llu",
device->reada_next - zone->start);
printk(KERN_CONT "\n");
- index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+ index = (zone->end >> PAGE_SHIFT) + 1;
}
cnt = 0;
index = 0;
@@ -848,10 +850,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
if (ret == 0)
break;
printk(KERN_DEBUG
- " re: logical %llu size %u empty %d for %lld",
+ " re: logical %llu size %u empty %d scheduled %d",
re->logical, fs_info->tree_root->nodesize,
- list_empty(&re->extctl), re->scheduled_for ?
- re->scheduled_for->devid : -1);
+ list_empty(&re->extctl), re->scheduled);
for (i = 0; i < re->nzones; ++i) {
printk(KERN_CONT " zone %llu-%llu devs",
@@ -863,7 +864,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
}
printk(KERN_CONT "\n");
- index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ index = (re->logical >> PAGE_SHIFT) + 1;
if (++cnt > 15)
break;
}
@@ -878,31 +879,25 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
index, 1);
if (ret == 0)
break;
- if (!re->scheduled_for) {
- index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ if (!re->scheduled) {
+ index = (re->logical >> PAGE_SHIFT) + 1;
continue;
}
printk(KERN_DEBUG
- "re: logical %llu size %u list empty %d for %lld",
+ "re: logical %llu size %u list empty %d scheduled %d",
re->logical, fs_info->tree_root->nodesize,
- list_empty(&re->extctl),
- re->scheduled_for ? re->scheduled_for->devid : -1);
+ list_empty(&re->extctl), re->scheduled);
for (i = 0; i < re->nzones; ++i) {
printk(KERN_CONT " zone %llu-%llu devs",
re->zones[i]->start,
re->zones[i]->end);
- for (i = 0; i < re->nzones; ++i) {
- printk(KERN_CONT " zone %llu-%llu devs",
- re->zones[i]->start,
- re->zones[i]->end);
- for (j = 0; j < re->zones[i]->ndevs; ++j) {
- printk(KERN_CONT " %lld",
- re->zones[i]->devs[j]->devid);
- }
+ for (j = 0; j < re->zones[i]->ndevs; ++j) {
+ printk(KERN_CONT " %lld",
+ re->zones[i]->devs[j]->devid);
}
}
printk(KERN_CONT "\n");
- index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+ index = (re->logical >> PAGE_SHIFT) + 1;
}
spin_unlock(&fs_info->reada_lock);
}
@@ -917,7 +912,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
struct reada_control *rc;
u64 start;
u64 generation;
- int level;
int ret;
struct extent_buffer *node;
static struct btrfs_key max_key = {
@@ -926,7 +920,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
.offset = (u64)-1
};
- rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ rc = kzalloc(sizeof(*rc), GFP_KERNEL);
if (!rc)
return ERR_PTR(-ENOMEM);
@@ -940,11 +934,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
node = btrfs_root_node(root);
start = node->start;
- level = btrfs_header_level(node);
generation = btrfs_header_generation(node);
free_extent_buffer(node);
- ret = reada_add_block(rc, start, &max_key, level, generation);
+ ret = reada_add_block(rc, start, &max_key, generation);
if (ret) {
kfree(rc);
return ERR_PTR(ret);
@@ -959,8 +952,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
+ struct btrfs_fs_info *fs_info = rc->root->fs_info;
while (atomic_read(&rc->elems)) {
+ if (!atomic_read(&fs_info->reada_works_cnt))
+ reada_start_machine(fs_info);
wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
5 * HZ);
dump_devs(rc->root->fs_info,
@@ -977,9 +973,13 @@ int btrfs_reada_wait(void *handle)
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
+ struct btrfs_fs_info *fs_info = rc->root->fs_info;
while (atomic_read(&rc->elems)) {
- wait_event(rc->wait, atomic_read(&rc->elems) == 0);
+ if (!atomic_read(&fs_info->reada_works_cnt))
+ reada_start_machine(fs_info);
+ wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+ (HZ + 9) / 10);
}
kref_put(&rc->refcnt, reada_control_release);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b4ca5454ef1a..08ef890deca6 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid)
root_objectid == BTRFS_TREE_LOG_OBJECTID ||
root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
root_objectid == BTRFS_UUID_TREE_OBJECTID ||
- root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
+ root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
+ root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
return 1;
return 0;
}
@@ -708,8 +709,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
err = -ENOMEM;
goto out;
}
- path1->reada = 1;
- path2->reada = 2;
+ path1->reada = READA_FORWARD;
+ path2->reada = READA_FORWARD;
node = alloc_backref_node(cache);
if (!node) {
@@ -1849,6 +1850,7 @@ again:
eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
+ break;
} else if (!extent_buffer_uptodate(eb)) {
ret = -EIO;
free_extent_buffer(eb);
@@ -2130,7 +2132,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -3030,7 +3032,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
int ret = 0;
BUG_ON(cluster->start != cluster->boundary[0]);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = btrfs_check_data_free_space(inode, cluster->start,
cluster->end + 1 - cluster->start);
@@ -3057,7 +3059,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
btrfs_free_reserved_data_space(inode, cluster->start,
cluster->end + 1 - cluster->start);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -3128,10 +3130,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (ret)
goto out;
- index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
- last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+ index = (cluster->start - offset) >> PAGE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_SHIFT;
while (index <= last_index) {
- ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE);
if (ret)
goto out;
@@ -3144,7 +3146,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
mask);
if (!page) {
btrfs_delalloc_release_metadata(inode,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -3161,16 +3163,16 @@ static int relocate_file_extent_cluster(struct inode *inode,
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
btrfs_delalloc_release_metadata(inode,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
ret = -EIO;
goto out;
}
}
page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
+ page_end = page_start + PAGE_SIZE - 1;
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
@@ -3190,7 +3192,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
unlock_extent(&BTRFS_I(inode)->io_tree,
page_start, page_end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
index++;
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -3527,7 +3529,7 @@ static int find_data_references(struct reloc_control *rc,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
root = read_fs_root(rc->extent_root->fs_info, ref_root);
if (IS_ERR(root)) {
@@ -3917,7 +3919,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
ret = prepare_to_relocate(rc);
if (ret) {
@@ -4343,7 +4345,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = -1;
+ path->reada = READA_BACK;
key.objectid = BTRFS_TREE_RELOC_OBJECTID;
key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7cf8509deda7..9fcd6dfc3266 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ if (err == -EEXIST)
+ err = 0;
if (err) {
- BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
@@ -488,7 +496,7 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_root_item *item = &root->root_item;
- struct timespec ct = CURRENT_TIME;
+ struct timespec ct = current_fs_time(root->fs_info->sb);
spin_lock(&root->root_item_lock);
btrfs_set_root_ctransid(item, trans->transid);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 550de89a8661..4678f03e878e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -248,14 +248,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size);
+ struct scrub_block *sblock,
+ int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
@@ -466,7 +461,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
int ret;
- sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+ sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
if (!sctx)
goto nomem;
atomic_set(&sctx->refs, 1);
@@ -477,7 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
struct scrub_bio *sbio;
- sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+ sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
if (!sbio)
goto nomem;
sctx->bios[i] = sbio;
@@ -616,7 +611,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
u64 flags = 0;
u64 ref_root;
u32 item_size;
- u8 ref_level;
+ u8 ref_level = 0;
int ret;
WARN_ON(sblock->page_count < 1);
@@ -708,7 +703,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
- index = offset >> PAGE_CACHE_SHIFT;
+ index = offset >> PAGE_SHIFT;
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
@@ -889,11 +884,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct btrfs_fs_info *fs_info;
u64 length;
u64 logical;
- u64 generation;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- u8 *csum;
struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
struct scrub_block *sblock_bad;
int ret;
@@ -918,13 +911,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
- generation = sblock_to_check->pagev[0]->generation;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->pagev[0]->have_csum;
- csum = sblock_to_check->pagev[0]->csum;
dev = sblock_to_check->pagev[0]->dev;
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
@@ -987,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
- scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -1101,9 +1091,7 @@ nodatasum_case:
sblock_other = sblocks_for_recheck + mirror_index;
/* build and submit the bios, check checksums */
- scrub_recheck_block(fs_info, sblock_other, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size, 0);
+ scrub_recheck_block(fs_info, sblock_other, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
@@ -1215,9 +1203,7 @@ nodatasum_case:
* is verified, but most likely the data comes out
* of the page cache.
*/
- scrub_recheck_block(fs_info, sblock_bad,
- is_metadata, have_csum, csum,
- generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
@@ -1318,6 +1304,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = original_sblock->page_count * PAGE_SIZE;
u64 logical = original_sblock->pagev[0]->logical;
+ u64 generation = original_sblock->pagev[0]->generation;
+ u64 flags = original_sblock->pagev[0]->flags;
+ u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 sublen;
@@ -1372,6 +1361,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
+
page = kzalloc(sizeof(*page), GFP_NOFS);
if (!page) {
leave_nomem:
@@ -1383,7 +1373,15 @@ leave_nomem:
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
+ page->sblock = sblock;
+ page->flags = flags;
+ page->generation = generation;
page->logical = logical;
+ page->have_csum = have_csum;
+ if (have_csum)
+ memcpy(page->csum,
+ original_sblock->pagev[0]->csum,
+ sctx->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
@@ -1474,15 +1472,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
* the pages that are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror)
+ struct scrub_block *sblock,
+ int retry_failed_mirror)
{
int page_num;
sblock->no_io_error_seen = 1;
- sblock->header_error = 0;
- sblock->checksum_error = 0;
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
@@ -1518,11 +1513,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
if (sblock->no_io_error_seen)
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- csum_size);
-
- return;
+ scrub_recheck_block_checksum(sblock);
}
static inline int scrub_check_fsid(u8 fsid[],
@@ -1535,61 +1526,16 @@ static inline int scrub_check_fsid(u8 fsid[],
return !ret;
}
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size)
+static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
- int page_num;
- u8 calculated_csum[BTRFS_CSUM_SIZE];
- u32 crc = ~(u32)0;
- void *mapped_buffer;
-
- WARN_ON(!sblock->pagev[0]->page);
- if (is_metadata) {
- struct btrfs_header *h;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- h = (struct btrfs_header *)mapped_buffer;
-
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
- memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE)) {
- sblock->header_error = 1;
- } else if (generation != btrfs_stack_header_generation(h)) {
- sblock->header_error = 1;
- sblock->generation_error = 1;
- }
- csum = h->csum;
- } else {
- if (!have_csum)
- return;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- }
-
- for (page_num = 0;;) {
- if (page_num == 0 && is_metadata)
- crc = btrfs_csum_data(
- ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
- crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
- else
- crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
-
- kunmap_atomic(mapped_buffer);
- page_num++;
- if (page_num >= sblock->page_count)
- break;
- WARN_ON(!sblock->pagev[page_num]->page);
-
- mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
- }
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+ sblock->generation_error = 0;
- btrfs_csum_final(crc, calculated_csum);
- if (memcmp(calculated_csum, csum, csum_size))
- sblock->checksum_error = 1;
+ if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ scrub_checksum_data(sblock);
+ else
+ scrub_checksum_tree_block(sblock);
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
@@ -1690,7 +1636,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
if (spage->io_error) {
void *mapped_buffer = kmap_atomic(spage->page);
- memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+ memset(mapped_buffer, 0, PAGE_SIZE);
flush_dcache_page(spage->page);
kunmap_atomic(mapped_buffer);
}
@@ -1708,7 +1654,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
again:
if (!wr_ctx->wr_curr_bio) {
wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
- GFP_NOFS);
+ GFP_KERNEL);
if (!wr_ctx->wr_curr_bio) {
mutex_unlock(&wr_ctx->wr_lock);
return -ENOMEM;
@@ -1725,7 +1671,8 @@ again:
sbio->dev = wr_ctx->tgtdev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+ bio = btrfs_io_bio_alloc(GFP_KERNEL,
+ wr_ctx->pages_per_wr_bio);
if (!bio) {
mutex_unlock(&wr_ctx->wr_lock);
return -ENOMEM;
@@ -1833,6 +1780,18 @@ static int scrub_checksum(struct scrub_block *sblock)
u64 flags;
int ret;
+ /*
+ * There is currently no need to initialize these stats,
+ * because this function only uses the return value
+ * instead of the values of these stats.
+ *
+ * Todo:
+ * always use stats
+ */
+ sblock->header_error = 0;
+ sblock->generation_error = 0;
+ sblock->checksum_error = 0;
+
WARN_ON(sblock->page_count < 1);
flags = sblock->pagev[0]->flags;
ret = 0;
@@ -1858,7 +1817,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct page *page;
void *buffer;
u32 crc = ~(u32)0;
- int fail = 0;
u64 len;
int index;
@@ -1889,9 +1847,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
btrfs_csum_final(crc, csum);
if (memcmp(csum, on_disk_csum, sctx->csum_size))
- fail = 1;
+ sblock->checksum_error = 1;
- return fail;
+ return sblock->checksum_error;
}
static int scrub_checksum_tree_block(struct scrub_block *sblock)
@@ -1907,8 +1865,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
- int fail = 0;
- int crc_fail = 0;
u64 len;
int index;
@@ -1923,19 +1879,20 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
-
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
- ++fail;
+ sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
- ++fail;
+ if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
- ++fail;
+ sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
- ++fail;
+ sblock->header_error = 1;
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1960,9 +1917,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
- ++crc_fail;
+ sblock->checksum_error = 1;
- return fail || crc_fail;
+ return sblock->header_error || sblock->checksum_error;
}
static int scrub_checksum_super(struct scrub_block *sblock)
@@ -2120,7 +2077,8 @@ again:
sbio->dev = spage->dev;
bio = sbio->bio;
if (!bio) {
- bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+ bio = btrfs_io_bio_alloc(GFP_KERNEL,
+ sctx->pages_per_rd_bio);
if (!bio)
return -ENOMEM;
sbio->bio = bio;
@@ -2176,39 +2134,27 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
- unsigned int is_metadata;
- unsigned int have_csum;
- u8 *csum;
- u64 generation;
u64 logical;
struct btrfs_device *dev;
- is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock->pagev[0]->have_csum;
- csum = sblock->pagev[0]->csum;
- generation = sblock->pagev[0]->generation;
logical = sblock->pagev[0]->logical;
dev = sblock->pagev[0]->dev;
- if (sblock->no_io_error_seen) {
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size);
- }
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(sblock);
if (!sblock->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
"IO error rebuilding logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- btrfs_err_rl_in_rcu(fs_info,
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
"failed to rebuild valid logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else {
@@ -2297,7 +2243,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
struct scrub_block *sblock;
int index;
- sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2315,7 +2261,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
struct scrub_page *spage;
u64 l = min_t(u64, len, PAGE_SIZE);
- spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
leave_nomem:
spin_lock(&sctx->stat_lock);
@@ -2342,7 +2288,7 @@ leave_nomem:
spage->have_csum = 0;
}
sblock->page_count++;
- spage->page = alloc_page(GFP_NOFS);
+ spage->page = alloc_page(GFP_KERNEL);
if (!spage->page)
goto leave_nomem;
len -= l;
@@ -2500,8 +2446,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
}
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
- u8 *csum)
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
unsigned long index;
@@ -2565,7 +2510,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
if (sctx->is_dev_replace && !have_csum) {
@@ -2598,7 +2543,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
struct scrub_block *sblock;
int index;
- sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+ sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
if (!sblock) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@@ -2618,7 +2563,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
struct scrub_page *spage;
u64 l = min_t(u64, len, PAGE_SIZE);
- spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ spage = kzalloc(sizeof(*spage), GFP_KERNEL);
if (!spage) {
leave_nomem:
spin_lock(&sctx->stat_lock);
@@ -2648,7 +2593,7 @@ leave_nomem:
spage->have_csum = 0;
}
sblock->page_count++;
- spage->page = alloc_page(GFP_NOFS);
+ spage->page = alloc_page(GFP_KERNEL);
if (!spage->page)
goto leave_nomem;
len -= l;
@@ -2703,7 +2648,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
goto skip;
}
@@ -2870,7 +2815,7 @@ out:
static inline int scrub_calc_parity_bitmap_len(int nsectors)
{
- return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+ return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
}
static void scrub_parity_get(struct scrub_parity *sparity)
@@ -3012,6 +2957,9 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
logic_start + map->stripe_len)) {
btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
key.objectid, logic_start);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
again:
@@ -3361,6 +3309,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
"scrub: tree block %llu spanning "
"stripes, ignored. logical=%llu",
key.objectid, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
@@ -3481,7 +3432,9 @@ out:
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
- u64 dev_offset, int is_dev_replace)
+ u64 dev_offset,
+ struct btrfs_block_group_cache *cache,
+ int is_dev_replace)
{
struct btrfs_mapping_tree *map_tree =
&sctx->dev_root->fs_info->mapping_tree;
@@ -3494,10 +3447,20 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
read_unlock(&map_tree->map_tree.lock);
- if (!em)
- return -EINVAL;
+ if (!em) {
+ /*
+ * Might have been an unused block group deleted by the cleaner
+ * kthread or relocation.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed)
+ ret = -EINVAL;
+ spin_unlock(&cache->lock);
- map = (struct map_lookup *)em->bdev;
+ return ret;
+ }
+
+ map = em->map_lookup;
if (em->start != chunk_offset)
goto out;
@@ -3532,6 +3495,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
u64 length;
u64 chunk_offset;
int ret = 0;
+ int ro_set;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -3543,7 +3507,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3617,7 +3581,21 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(root, cache);
scrub_pause_off(fs_info);
- if (ret) {
+
+ if (ret == 0) {
+ ro_set = 1;
+ } else if (ret == -ENOSPC) {
+ /*
+ * btrfs_inc_block_group_ro returns -ENOSPC when it
+ * fails to create a new chunk for metadata.
+ * It is not a problem for scrub/replace, because
+ * metadata are always cowed, and our scrub paused
+ * commit_transactions.
+ */
+ ro_set = 0;
+ } else {
+ btrfs_warn(fs_info, "failed setting block group ro, ret=%d\n",
+ ret);
btrfs_put_block_group(cache);
break;
}
@@ -3626,7 +3604,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, is_dev_replace);
+ found_key.offset, cache, is_dev_replace);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3660,7 +3638,30 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- btrfs_dec_block_group_ro(root, cache);
+ if (ro_set)
+ btrfs_dec_block_group_ro(root, cache);
+
+ /*
+ * We might have prevented the cleaner kthread from deleting
+ * this block group if it was already unused because we raced
+ * and set it to RO mode first. So add it back to the unused
+ * list, otherwise it might not ever be deleted unless a manual
+ * balance is triggered or it becomes used and unused again.
+ */
+ spin_lock(&cache->lock);
+ if (!cache->removed && !cache->ro && cache->reserved == 0 &&
+ btrfs_block_group_used(&cache->item) == 0) {
+ spin_unlock(&cache->lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &fs_info->unused_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ } else {
+ spin_unlock(&cache->lock);
+ }
btrfs_put_block_group(cache);
if (ret)
@@ -3734,27 +3735,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
1, 4);
else
fs_info->scrub_workers =
- btrfs_alloc_workqueue("btrfs-scrub", flags,
+ btrfs_alloc_workqueue("scrub", flags,
max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ btrfs_alloc_workqueue("scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
fs_info->scrub_nocow_workers =
- btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
if (!fs_info->scrub_nocow_workers)
goto fail_scrub_nocow_workers;
fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+ btrfs_alloc_workqueue("scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
@@ -3858,16 +3859,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EIO;
}
- btrfs_dev_replace_lock(&fs_info->dev_replace);
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
if (dev->scrub_device ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -EINPROGRESS;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret) {
@@ -4210,7 +4211,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
io_tree = &BTRFS_I(inode)->io_tree;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
if (ordered) {
btrfs_put_ordered_extent(ordered);
@@ -4280,7 +4281,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
return PTR_ERR(inode);
/* Avoid truncate/dio/punch hole.. */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode_dio_wait(inode);
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
@@ -4293,8 +4294,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
goto out;
}
- while (len >= PAGE_CACHE_SIZE) {
- index = offset >> PAGE_CACHE_SHIFT;
+ while (len >= PAGE_SIZE) {
+ index = offset >> PAGE_SHIFT;
again:
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
@@ -4325,7 +4326,7 @@ again:
*/
if (page->mapping != inode->i_mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto again;
}
if (!PageUptodate(page)) {
@@ -4347,19 +4348,19 @@ again:
ret = err;
next_page:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (ret)
break;
- offset += PAGE_CACHE_SIZE;
- physical_for_dev_replace += PAGE_CACHE_SIZE;
- nocow_ctx_logical += PAGE_CACHE_SIZE;
- len -= PAGE_CACHE_SIZE;
+ offset += PAGE_SIZE;
+ physical_for_dev_replace += PAGE_SIZE;
+ nocow_ctx_logical += PAGE_SIZE;
+ len -= PAGE_SIZE;
}
ret = COPY_COMPLETE;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
return ret;
}
@@ -4389,8 +4390,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio->bi_bdev = dev->bdev;
- ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
- if (ret != PAGE_CACHE_SIZE) {
+ ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+ if (ret != PAGE_SIZE) {
leave_with_eio:
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 355a458cba1a..8d358c547c59 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -34,6 +34,7 @@
#include "disk-io.h"
#include "btrfs_inode.h"
#include "transaction.h"
+#include "compression.h"
static int g_verbose = 0;
@@ -304,7 +305,7 @@ static struct fs_path *fs_path_alloc(void)
{
struct fs_path *p;
- p = kmalloc(sizeof(*p), GFP_NOFS);
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
p->reversed = 0;
@@ -363,11 +364,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
* First time the inline_buf does not suffice
*/
if (p->buf == p->inline_buf) {
- tmp_buf = kmalloc(len, GFP_NOFS);
+ tmp_buf = kmalloc(len, GFP_KERNEL);
if (tmp_buf)
memcpy(tmp_buf, p->buf, old_buf_len);
} else {
- tmp_buf = krealloc(p->buf, len, GFP_NOFS);
+ tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
}
if (!tmp_buf)
return -ENOMEM;
@@ -995,7 +996,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
* values are small.
*/
buf_len = PATH_MAX;
- buf = kmalloc(buf_len, GFP_NOFS);
+ buf = kmalloc(buf_len, GFP_KERNEL);
if (!buf) {
ret = -ENOMEM;
goto out;
@@ -1042,7 +1043,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
buf = NULL;
} else {
char *tmp = krealloc(buf, buf_len,
- GFP_NOFS | __GFP_NOWARN);
+ GFP_KERNEL | __GFP_NOWARN);
if (!tmp)
kfree(buf);
@@ -1303,7 +1304,7 @@ static int find_extent_clone(struct send_ctx *sctx,
/* We only use this path under the commit sem */
tmp_path->need_commit_sem = 0;
- backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
+ backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
if (!backref_ctx) {
ret = -ENOMEM;
goto out;
@@ -1469,7 +1470,21 @@ static int read_symlink(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret);
+ if (ret) {
+ /*
+ * An empty symlink inode. Can happen in rare error paths when
+ * creating a symlink (transaction committed before the inode
+ * eviction handler removed the symlink inode items and a crash
+ * happened in between or the subvol was snapshotted in between).
+ * Print an informative message to dmesg/syslog so that the user
+ * can delete the symlink.
+ */
+ btrfs_err(root->fs_info,
+ "Found empty symlink inode %llu at root %llu",
+ ino, root->root_key.objectid);
+ ret = -EIO;
+ goto out;
+ }
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
@@ -1970,7 +1985,7 @@ static int name_cache_insert(struct send_ctx *sctx,
nce_head = radix_tree_lookup(&sctx->name_cache,
(unsigned long)nce->ino);
if (!nce_head) {
- nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
+ nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
if (!nce_head) {
kfree(nce);
return -ENOMEM;
@@ -2165,7 +2180,7 @@ out_cache:
/*
* Store the result of the lookup in the name cache.
*/
- nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_NOFS);
+ nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
if (!nce) {
ret = -ENOMEM;
goto out;
@@ -2301,7 +2316,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
if (!path)
return -ENOMEM;
- name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_NOFS);
+ name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
if (!name) {
btrfs_free_path(path);
return -ENOMEM;
@@ -2716,7 +2731,7 @@ static int __record_ref(struct list_head *head, u64 dir,
{
struct recorded_ref *ref;
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ ref = kmalloc(sizeof(*ref), GFP_KERNEL);
if (!ref)
return -ENOMEM;
@@ -2741,7 +2756,7 @@ static int dup_ref(struct recorded_ref *ref, struct list_head *list)
{
struct recorded_ref *new;
- new = kmalloc(sizeof(*ref), GFP_NOFS);
+ new = kmalloc(sizeof(*ref), GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -2804,7 +2819,7 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
struct rb_node *parent = NULL;
struct orphan_dir_info *entry, *odi;
- odi = kmalloc(sizeof(*odi), GFP_NOFS);
+ odi = kmalloc(sizeof(*odi), GFP_KERNEL);
if (!odi)
return ERR_PTR(-ENOMEM);
odi->ino = dir_ino;
@@ -2959,7 +2974,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
struct rb_node *parent = NULL;
struct waiting_dir_move *entry, *dm;
- dm = kmalloc(sizeof(*dm), GFP_NOFS);
+ dm = kmalloc(sizeof(*dm), GFP_KERNEL);
if (!dm)
return -ENOMEM;
dm->ino = ino;
@@ -3026,7 +3041,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
int exists = 0;
int ret;
- pm = kmalloc(sizeof(*pm), GFP_NOFS);
+ pm = kmalloc(sizeof(*pm), GFP_KERNEL);
if (!pm)
return -ENOMEM;
pm->parent_ino = parent_ino;
@@ -4266,7 +4281,7 @@ static int __find_xattr(int num, struct btrfs_key *di_key,
strncmp(name, ctx->name, name_len) == 0) {
ctx->found_idx = num;
ctx->found_data_len = data_len;
- ctx->found_data = kmemdup(data, data_len, GFP_NOFS);
+ ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
if (!ctx->found_data)
return -ENOMEM;
return 1;
@@ -4434,9 +4449,9 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
struct page *page;
char *addr;
struct btrfs_key key;
- pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
- unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+ unsigned pg_offset = offset & ~PAGE_MASK;
ssize_t ret = 0;
key.objectid = sctx->cur_ino;
@@ -4456,7 +4471,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
if (len == 0)
goto out;
- last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
@@ -4466,8 +4481,8 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
- PAGE_CACHE_SIZE - pg_offset);
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ PAGE_SIZE - pg_offset);
+ page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
break;
@@ -4478,7 +4493,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ret = -EIO;
break;
}
@@ -4488,7 +4503,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
kunmap(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
@@ -4789,7 +4804,7 @@ static int clone_range(struct send_ctx *sctx,
type = btrfs_file_extent_type(leaf, ei);
if (type == BTRFS_FILE_EXTENT_INLINE) {
ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
- ext_len = PAGE_CACHE_ALIGN(ext_len);
+ ext_len = PAGE_ALIGN(ext_len);
} else {
ext_len = btrfs_file_extent_num_bytes(leaf, ei);
}
@@ -4871,7 +4886,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
* but there may be items after this page. Make
* sure to send the whole thing
*/
- len = PAGE_CACHE_ALIGN(len);
+ len = PAGE_ALIGN(len);
} else {
len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
}
@@ -5975,7 +5990,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
- sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS);
+ sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
if (!sctx) {
ret = -ENOMEM;
goto out;
@@ -5983,7 +5998,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
- INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS);
+ INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
sctx->flags = arg->flags;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 48d425aef05b..02e00166c4da 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -22,8 +22,8 @@
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
#define BTRFS_SEND_STREAM_VERSION 1
-#define BTRFS_SEND_BUF_SIZE (1024 * 64)
-#define BTRFS_SEND_READ_SIZE (1024 * 48)
+#define BTRFS_SEND_BUF_SIZE SZ_64K
+#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index b976597b0721..e05619f241be 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -66,7 +66,7 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \
\
if (token && token->kaddr && token->offset <= offset && \
token->eb == eb && \
- (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ (token->offset + PAGE_SIZE >= offset + size)) { \
kaddr = token->kaddr; \
p = kaddr + part_offset - token->offset; \
res = get_unaligned_le##bits(p + off); \
@@ -104,7 +104,7 @@ void btrfs_set_token_##bits(struct extent_buffer *eb, \
\
if (token && token->kaddr && token->offset <= offset && \
token->eb == eb && \
- (token->offset + PAGE_CACHE_SIZE >= offset + size)) { \
+ (token->offset + PAGE_SIZE >= offset + size)) { \
kaddr = token->kaddr; \
p = kaddr + part_offset - token->offset; \
put_unaligned_le##bits(val, p + off); \
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 24154e422945..00b8f37cc306 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -295,21 +295,23 @@ enum {
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
- Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
- Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+ Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+ Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+ Opt_skip_balance, Opt_check_integrity,
+ Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
- Opt_datasum, Opt_treelog, Opt_noinode_cache,
+ Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
+ Opt_nologreplay, Opt_norecovery,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_degraded, "degraded"},
{Opt_subvol, "subvol=%s"},
{Opt_subvolid, "subvolid=%s"},
@@ -334,12 +336,15 @@ static match_table_t tokens = {
{Opt_noacl, "noacl"},
{Opt_notreelog, "notreelog"},
{Opt_treelog, "treelog"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_norecovery, "norecovery"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_noflushoncommit, "noflushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
{Opt_space_cache, "space_cache"},
+ {Opt_space_cache_version, "space_cache=%s"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"},
@@ -350,7 +355,8 @@ static match_table_t tokens = {
{Opt_inode_cache, "inode_cache"},
{Opt_noinode_cache, "noinode_cache"},
{Opt_no_space_cache, "nospace_cache"},
- {Opt_recovery, "recovery"},
+ {Opt_recovery, "recovery"}, /* deprecated */
+ {Opt_usebackuproot, "usebackuproot"},
{Opt_skip_balance, "skip_balance"},
{Opt_check_integrity, "check_int"},
{Opt_check_integrity_including_extent_data, "check_int_data"},
@@ -371,7 +377,8 @@ static match_table_t tokens = {
* reading in a new superblock is parsed here.
* XXX JDM: This needs to be cleaned up for remount.
*/
-int btrfs_parse_options(struct btrfs_root *root, char *options)
+int btrfs_parse_options(struct btrfs_root *root, char *options,
+ unsigned long new_flags)
{
struct btrfs_fs_info *info = root->fs_info;
substring_t args[MAX_OPT_ARGS];
@@ -381,13 +388,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
int ret = 0;
char *compress_type;
bool compress_force = false;
+ enum btrfs_compression_type saved_compress_type;
+ bool saved_compress_force;
+ int no_compress = 0;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (cache_gen)
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+ else if (cache_gen)
btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ /*
+ * Even if the options are empty, we still need to do an extra check
+ * against the new flags.
+ */
if (!options)
- goto out;
+ goto check;
/*
* strsep changes the string, duplicate it because parse_options
@@ -458,6 +474,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
+ saved_compress_type = btrfs_test_opt(root, COMPRESS) ?
+ info->compress_type : BTRFS_COMPRESS_NONE;
+ saved_compress_force =
+ btrfs_test_opt(root, FORCE_COMPRESS);
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -466,6 +486,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
+ no_compress = 0;
} else if (strcmp(args[0].from, "lzo") == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
@@ -473,25 +494,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
btrfs_set_fs_incompat(info, COMPRESS_LZO);
+ no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
+ no_compress++;
} else {
ret = -EINVAL;
goto out;
}
if (compress_force) {
- btrfs_set_and_info(root, FORCE_COMPRESS,
- "force %s compression",
- compress_type);
+ btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
} else {
- if (!btrfs_test_opt(root, COMPRESS))
- btrfs_info(root->fs_info,
- "btrfs: use %s compression",
- compress_type);
/*
* If we remount from compress-force=xxx to
* compress=xxx, we need clear FORCE_COMPRESS
@@ -500,6 +517,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
+ if ((btrfs_test_opt(root, COMPRESS) &&
+ (info->compress_type != saved_compress_type ||
+ compress_force != saved_compress_force)) ||
+ (!btrfs_test_opt(root, COMPRESS) &&
+ no_compress == 1)) {
+ btrfs_info(root->fs_info,
+ "%s %s compression",
+ (compress_force) ? "force" : "use",
+ compress_type);
+ }
+ compress_force = false;
break;
case Opt_ssd:
btrfs_set_and_info(root, SSD,
@@ -587,6 +615,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_clear_and_info(root, NOTREELOG,
"enabling tree log");
break;
+ case Opt_norecovery:
+ case Opt_nologreplay:
+ btrfs_set_and_info(root, NOLOGREPLAY,
+ "disabling log replay at mount time");
+ break;
case Opt_flushoncommit:
btrfs_set_and_info(root, FLUSHONCOMMIT,
"turning on flush-on-commit");
@@ -617,15 +650,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
"turning off discard");
break;
case Opt_space_cache:
- btrfs_set_and_info(root, SPACE_CACHE,
- "enabling disk space caching");
+ case Opt_space_cache_version:
+ if (token == Opt_space_cache ||
+ strcmp(args[0].from, "v1") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ FREE_SPACE_TREE);
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
+ } else if (strcmp(args[0].from, "v2") == 0) {
+ btrfs_clear_opt(root->fs_info->mount_opt,
+ SPACE_CACHE);
+ btrfs_set_and_info(root, FREE_SPACE_TREE,
+ "enabling free space tree");
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- btrfs_clear_and_info(root, SPACE_CACHE,
- "disabling disk space caching");
+ if (btrfs_test_opt(root, SPACE_CACHE)) {
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
+ }
+ if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(root, FREE_SPACE_TREE,
+ "disabling free space tree");
+ }
break;
case Opt_inode_cache:
btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
@@ -657,8 +710,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
"disabling auto defrag");
break;
case Opt_recovery:
- btrfs_info(root->fs_info, "enabling auto recovery");
- btrfs_set_opt(info->mount_opt, RECOVERY);
+ btrfs_warn(root->fs_info,
+ "'recovery' is deprecated, use 'usebackuproot' instead");
+ case Opt_usebackuproot:
+ btrfs_info(root->fs_info,
+ "trying to use backup root at mount time");
+ btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
break;
case Opt_skip_balance:
btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
@@ -753,9 +810,27 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
break;
}
}
+check:
+ /*
+ * Extra check of the parsed options against the new mount flags
+ */
+ if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
+ btrfs_err(root->fs_info,
+ "nologreplay must be used with ro mount option");
+ ret = -EINVAL;
+ }
out:
+ if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(root, CLEAR_CACHE)) {
+ btrfs_err(root->fs_info, "cannot disable free space tree");
+ ret = -EINVAL;
+
+ }
if (!ret && btrfs_test_opt(root, SPACE_CACHE))
btrfs_info(root->fs_info, "disk space caching is enabled");
+ if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+ btrfs_info(root->fs_info, "using free space tree");
kfree(orig);
return ret;
}
@@ -1154,6 +1229,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ssd");
if (btrfs_test_opt(root, NOTREELOG))
seq_puts(seq, ",notreelog");
+ if (btrfs_test_opt(root, NOLOGREPLAY))
+ seq_puts(seq, ",nologreplay");
if (btrfs_test_opt(root, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
if (btrfs_test_opt(root, DISCARD))
@@ -1162,6 +1239,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",noacl");
if (btrfs_test_opt(root, SPACE_CACHE))
seq_puts(seq, ",space_cache");
+ else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+ seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
if (btrfs_test_opt(root, RESCAN_UUID_TREE))
@@ -1178,8 +1257,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",inode_cache");
if (btrfs_test_opt(root, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
- if (btrfs_test_opt(root, RECOVERY))
- seq_puts(seq, ",recovery");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
seq_puts(seq, ",check_int_data");
@@ -1514,9 +1591,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if ((flags ^ s->s_flags) & MS_RDONLY)
error = -EBUSY;
} else {
- char b[BDEVNAME_SIZE];
-
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
@@ -1637,7 +1712,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
}
- ret = btrfs_parse_options(root, data);
+ ret = btrfs_parse_options(root, data, *flags);
if (ret) {
ret = -EINVAL;
goto restore;
@@ -1865,7 +1940,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* btrfs starts at an offset of at least 1MB when doing chunk
* allocation.
*/
- skip_space = 1024 * 1024;
+ skip_space = SZ_1M;
/* user can set the offset in fs_info->alloc_start. */
if (fs_info->alloc_start &&
@@ -1956,6 +2031,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* there are other factors that may change the result (like a new metadata
* chunk).
*
+ * If metadata is exhausted, f_bavail will be 0.
+ *
* FIXME: not accurate for mixed block groups, total and free/used are ok,
* available appears slightly larger.
*/
@@ -1967,11 +2044,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
+ u64 total_free_meta = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)fs_info->fsid;
unsigned factor = 1;
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
int ret;
+ u64 thresh = 0;
/*
* holding chunk_muext to avoid allocating new chunks, holding
@@ -1997,6 +2076,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
}
}
}
+ if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
+ total_free_meta += found->disk_total - found->disk_used;
total_used += found->disk_used;
}
@@ -2019,6 +2100,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
+ /*
+ * We calculate the remaining metadata space minus global reserve. If
+ * this is (supposedly) smaller than zero, there's no space. But this
+ * does not hold in practice: the exhausted state happens while there is
+ * still some positive delta. So we apply some guesswork and compare the
+ * delta to a 4M threshold. (Practically observed delta was ~2M.)
+ *
+ * We probably cannot calculate the exact threshold value because this
+ * depends on the internal reservations requested by various
+ * operations, so some operations that consume little metadata will
+ * succeed even if the Avail is zero. But this is better than the other
+ * way around.
+ */
+ thresh = 4 * 1024 * 1024;
+
+ if (total_free_meta - thresh < block_rsv->size)
+ buf->f_bavail = 0;
+
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_namelen = BTRFS_NAME_LEN;
@@ -2091,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
break;
ret = !(fs_devices->num_devices == fs_devices->total_devices);
break;
+ case BTRFS_IOC_GET_SUPPORTED_FEATURES:
+ ret = btrfs_ioctl_get_supported_features((void __user*)arg);
+ break;
}
kfree(vol);
@@ -2189,7 +2291,7 @@ static void btrfs_interface_exit(void)
misc_deregister(&btrfs_misc);
}
-static void btrfs_print_info(void)
+static void btrfs_print_mod_info(void)
{
printk(KERN_INFO "Btrfs loaded"
#ifdef CONFIG_BTRFS_DEBUG
@@ -2225,6 +2327,9 @@ static int btrfs_run_sanity_tests(void)
if (ret)
goto out;
ret = btrfs_test_qgroups();
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree();
out:
btrfs_destroy_test_fs();
return ret;
@@ -2288,7 +2393,7 @@ static int __init init_btrfs_fs(void)
btrfs_init_lockdep();
- btrfs_print_info();
+ btrfs_print_mod_info();
err = btrfs_run_sanity_tests();
if (err)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e0ac85949067..539e7b5e3f86 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid56),
BTRFS_FEAT_ATTR_PTR(skinny_metadata),
BTRFS_FEAT_ATTR_PTR(no_holes),
+ BTRFS_FEAT_ATTR_PTR(free_space_tree),
NULL
};
@@ -780,6 +782,39 @@ failure:
return error;
}
+
+/*
+ * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
+ * values in superblock. Call after any changes to incompat/compat_ro flags
+ */
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+ u64 bit, enum btrfs_feature_set set)
+{
+ struct btrfs_fs_devices *fs_devs;
+ struct kobject *fsid_kobj;
+ u64 features;
+ int ret;
+
+ if (!fs_info)
+ return;
+
+ features = get_features(fs_info, set);
+ ASSERT(bit & supported_feature_masks[set]);
+
+ fs_devs = fs_info->fs_devices;
+ fsid_kobj = &fs_devs->fsid_kobj;
+
+ if (!fsid_kobj->state_initialized)
+ return;
+
+ /*
+ * FIXME: this is too heavy to update just one value, ideally we'd like
+ * to use sysfs_update_group but some refactoring is needed first.
+ */
+ sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+ ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+}
+
static int btrfs_init_debugfs(void)
{
#ifdef CONFIG_DEBUG_FS
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9c09522125a6..d7da1a4c2f6c 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \
#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
- BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
@@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+ u64 bit, enum btrfs_feature_set set);
+
#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9626252ee6b4..f54bf450bad3 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
#include <linux/magic.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../free-space-cache.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
#include "../volumes.h"
#include "../disk-io.h"
#include "../qgroup.h"
@@ -79,18 +82,18 @@ void btrfs_destroy_test_fs(void)
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
{
struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
- GFP_NOFS);
+ GFP_KERNEL);
if (!fs_info)
return fs_info;
fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
- GFP_NOFS);
+ GFP_KERNEL);
if (!fs_info->fs_devices) {
kfree(fs_info);
return NULL;
}
fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
- GFP_NOFS);
+ GFP_KERNEL);
if (!fs_info->super_copy) {
kfree(fs_info->fs_devices);
kfree(fs_info);
@@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+ extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
return fs_info;
}
@@ -131,7 +137,6 @@ static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void **slot;
spin_lock(&fs_info->buffer_lock);
-restart:
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
@@ -141,7 +146,7 @@ restart:
/* Shouldn't happen but that kind of thinking creates CVE's */
if (radix_tree_exception(eb)) {
if (radix_tree_deref_retry(eb))
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
continue;
}
spin_unlock(&fs_info->buffer_lock);
@@ -169,3 +174,49 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
kfree(root);
}
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+ if (!cache)
+ return NULL;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_KERNEL);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = 0;
+ cache->key.offset = length;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = 4096;
+ cache->full_stripe_len = 4096;
+
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->bg_list);
+ btrfs_init_free_space_ctl(cache);
+ mutex_init(&cache->free_space_lock);
+
+ return cache;
+}
+
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+{
+ if (!cache)
+ return;
+ __btrfs_remove_free_space_cache(cache->free_space_ctl);
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+}
+
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index fd3954224480..054b8c73c951 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,17 +24,23 @@
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
struct btrfs_root;
+struct btrfs_trans_handle;
int btrfs_test_free_space_cache(void);
int btrfs_test_extent_buffer_operations(void);
int btrfs_test_extent_io(void);
int btrfs_test_inodes(void);
int btrfs_test_qgroups(void);
+int btrfs_test_free_space_tree(void);
int btrfs_init_test_fs(void);
void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
void btrfs_free_dummy_root(struct btrfs_root *root);
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length);
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
static inline int btrfs_test_free_space_cache(void)
{
@@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void)
{
return 0;
}
+static inline int btrfs_test_free_space_tree(void)
+{
+ return 0;
+}
#endif
#endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e9f2368177d..70948b13bc81 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
#include "btrfs-tests.h"
#include "../extent_io.h"
@@ -30,8 +32,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
{
int ret;
struct page *pages[16];
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long index = start >> PAGE_SHIFT;
+ unsigned long end_index = end >> PAGE_SHIFT;
unsigned long nr_pages = end_index - index + 1;
int i;
int count = 0;
@@ -47,9 +49,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
count++;
if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
unlock_page(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
if (flags & PROCESS_RELEASE)
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
nr_pages -= ret;
index += ret;
@@ -70,12 +72,14 @@ static int test_find_delalloc(void)
struct page *page;
struct page *locked_page = NULL;
unsigned long index = 0;
- u64 total_dirty = 256 * 1024 * 1024;
- u64 max_bytes = 128 * 1024 * 1024;
+ u64 total_dirty = SZ_256M;
+ u64 max_bytes = SZ_128M;
u64 start, end, test_start;
u64 found;
int ret = -EINVAL;
+ test_msg("Running find delalloc tests\n");
+
inode = btrfs_new_test_inode();
if (!inode) {
test_msg("Failed to allocate test inode\n");
@@ -89,8 +93,8 @@ static int test_find_delalloc(void)
* everything to make sure our pages don't get evicted and screw up our
* test.
*/
- for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
- page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
+ page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
test_msg("Failed to allocate test page\n");
ret = -ENOMEM;
@@ -100,7 +104,7 @@ static int test_find_delalloc(void)
if (index) {
unlock_page(page);
} else {
- page_cache_get(page);
+ get_page(page);
locked_page = page;
}
}
@@ -109,7 +113,7 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+ set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
start = 0;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -125,7 +129,7 @@ static int test_find_delalloc(void)
}
unlock_extent(&tmp, start, end);
unlock_page(locked_page);
- page_cache_release(locked_page);
+ put_page(locked_page);
/*
* Test this scenario
@@ -133,14 +137,14 @@ static int test_find_delalloc(void)
* |--- delalloc ---|
* |--- search ---|
*/
- test_start = 64 * 1024 * 1024;
+ test_start = SZ_64M;
locked_page = find_lock_page(inode->i_mapping,
- test_start >> PAGE_CACHE_SHIFT);
+ test_start >> PAGE_SHIFT);
if (!locked_page) {
test_msg("Couldn't find the locked page\n");
goto out_bits;
}
- set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+ set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -161,7 +165,7 @@ static int test_find_delalloc(void)
}
unlock_extent(&tmp, start, end);
/* locked_page was unlocked above */
- page_cache_release(locked_page);
+ put_page(locked_page);
/*
* Test this scenario
@@ -170,7 +174,7 @@ static int test_find_delalloc(void)
*/
test_start = max_bytes + 4096;
locked_page = find_lock_page(inode->i_mapping, test_start >>
- PAGE_CACHE_SHIFT);
+ PAGE_SHIFT);
if (!locked_page) {
test_msg("Could'nt find the locked page\n");
goto out_bits;
@@ -195,7 +199,7 @@ static int test_find_delalloc(void)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
start = test_start;
end = 0;
found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -220,14 +224,14 @@ static int test_find_delalloc(void)
* Now to test where we run into a page that is no longer dirty in the
* range we want to find.
*/
- page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
- >> PAGE_CACHE_SHIFT);
+ page = find_get_page(inode->i_mapping,
+ (max_bytes + SZ_1M) >> PAGE_SHIFT);
if (!page) {
test_msg("Couldn't find our page\n");
goto out_bits;
}
ClearPageDirty(page);
- page_cache_release(page);
+ put_page(page);
/* We unlocked it in the previous test */
lock_page(locked_page);
@@ -235,7 +239,7 @@ static int test_find_delalloc(void)
end = 0;
/*
* Currently if we fail to find dirty pages in the delalloc range we
- * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If
+ * will adjust max_bytes down to PAGE_SIZE and then re-search. If
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
@@ -245,9 +249,9 @@ static int test_find_delalloc(void)
test_msg("Didn't find our range\n");
goto out_bits;
}
- if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+ if (start != test_start && end != test_start + PAGE_SIZE - 1) {
test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
- test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+ test_start, test_start + PAGE_SIZE - 1, start,
end);
goto out_bits;
}
@@ -258,18 +262,149 @@ static int test_find_delalloc(void)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
out:
if (locked_page)
- page_cache_release(locked_page);
+ put_page(locked_page);
process_page_range(inode, 0, total_dirty - 1,
PROCESS_UNLOCK | PROCESS_RELEASE);
iput(inode);
return ret;
}
+static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
+ unsigned long len)
+{
+ unsigned long i, x;
+
+ memset(bitmap, 0, len);
+ memset_extent_buffer(eb, 0, 0, len);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Bitmap was not zeroed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing all bits failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Setting straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+ bitmap_clear(bitmap,
+ (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+ sizeof(long) * BITS_PER_BYTE);
+ extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+ extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0,
+ sizeof(long) * BITS_PER_BYTE);
+ if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+ test_msg("Clearing straddling pages failed\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Generate a wonky pseudo-random bit pattern for the sake of not using
+ * something repetitive that could miss some hypothetical off-by-n bug.
+ */
+ x = 0;
+ for (i = 0; i < len / sizeof(long); i++) {
+ x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL;
+ bitmap[i] = x;
+ }
+ write_extent_buffer(eb, bitmap, 0, len);
+
+ for (i = 0; i < len * BITS_PER_BYTE; i++) {
+ int bit, bit1;
+
+ bit = !!test_bit(i, bitmap);
+ bit1 = !!extent_buffer_test_bit(eb, 0, i);
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern failed\n");
+ return -EINVAL;
+ }
+
+ bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1 != bit) {
+ test_msg("Testing bit pattern with offset failed\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int test_eb_bitmaps(void)
+{
+ unsigned long len = PAGE_SIZE * 4;
+ unsigned long *bitmap;
+ struct extent_buffer *eb;
+ int ret;
+
+ test_msg("Running extent buffer bitmap tests\n");
+
+ bitmap = kmalloc(len, GFP_KERNEL);
+ if (!bitmap) {
+ test_msg("Couldn't allocate test bitmap\n");
+ return -ENOMEM;
+ }
+
+ eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+ if (ret)
+ goto out;
+
+ /* Do it over again with an extent buffer which isn't page-aligned. */
+ free_extent_buffer(eb);
+ eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len);
+ if (!eb) {
+ test_msg("Couldn't allocate test extent buffer\n");
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+
+ ret = __test_eb_bitmaps(bitmap, eb, len);
+out:
+ free_extent_buffer(eb);
+ kfree(bitmap);
+ return ret;
+}
+
int btrfs_test_extent_io(void)
{
- test_msg("Running find delalloc tests\n");
- return test_find_delalloc();
+ int ret;
+
+ test_msg("Running extent I/O tests\n");
+
+ ret = test_find_delalloc();
+ if (ret)
+ goto out;
+
+ ret = test_eb_bitmaps();
+out:
+ test_msg("Extent I/O tests finished\n");
+ return ret;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8c3d70c31ff..514247515312 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -22,42 +22,7 @@
#include "../disk-io.h"
#include "../free-space-cache.h"
-#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
-static struct btrfs_block_group_cache *init_test_block_group(void)
-{
- struct btrfs_block_group_cache *cache;
-
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
- if (!cache)
- return NULL;
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
- if (!cache->free_space_ctl) {
- kfree(cache);
- return NULL;
- }
- cache->fs_info = btrfs_alloc_dummy_fs_info();
- if (!cache->fs_info) {
- kfree(cache->free_space_ctl);
- kfree(cache);
- return NULL;
- }
-
- cache->key.objectid = 0;
- cache->key.offset = 1024 * 1024 * 1024;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = 4096;
- cache->full_stripe_len = 4096;
-
- spin_lock_init(&cache->lock);
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->bg_list);
-
- btrfs_init_free_space_ctl(cache);
-
- return cache;
-}
+#define BITS_PER_BITMAP (PAGE_SIZE * 8)
/*
* This test just does basic sanity checking, making sure we can add an exten
@@ -71,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache)
test_msg("Running extent only tests\n");
/* First just make sure we can remove an entire entry */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding initial extents %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing extent %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Full remove left some lingering space\n");
return -1;
}
/* Ok edge and middle cases now */
- ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_add_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error adding half extent %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
if (ret) {
test_msg("Error removing tail end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Error removing front end %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+ ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
if (ret) {
test_msg("Error removing middle piece %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Still have space at the front\n");
return -1;
}
- if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+ if (test_check_exists(cache, SZ_2M, 4096)) {
test_msg("Still have space in the middle\n");
return -1;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
test_msg("Still have space at the end\n");
return -1;
}
@@ -141,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
test_msg("Running bitmap only tests\n");
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't create a bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_4M);
if (ret) {
test_msg("Error removing bitmap full range %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_4M)) {
test_msg("Left some space in bitmap\n");
return -1;
}
- ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to our bitmap entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove middle chunk %d\n", ret);
return ret;
@@ -177,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
/* Test a bit straddling two bitmaps */
- ret = test_add_free_space_entry(cache, next_bitmap_offset -
- (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
+ SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space that straddles two bitmaps %d\n",
ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, next_bitmap_offset -
- (1 * 1024 * 1024), 2 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
- 2 * 1024 * 1024)) {
+ if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
test_msg("Left some space when removing overlapping\n");
return -1;
}
@@ -216,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* bitmap, but the free space completely in the extent and then
* completely in the bitmap.
*/
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
if (ret) {
test_msg("Couldn't create bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_1M);
if (ret) {
test_msg("Couldn't remove extent entry %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_1M)) {
test_msg("Left remnants after our remove\n");
return -1;
}
/* Now to add back the extent entry and remove from the bitmap */
- ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
if (ret) {
test_msg("Couldn't re-add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
if (ret) {
test_msg("Couldn't remove from bitmap %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_4M, SZ_1M)) {
test_msg("Left remnants in the bitmap\n");
return -1;
}
@@ -261,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* Ok so a little more evil, extent entry and bitmap at the same offset,
* removing an overlapping chunk.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add to a bitmap %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
if (ret) {
test_msg("Couldn't remove overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+ if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
test_msg("Left over pieces after removing overlapping\n");
return -1;
}
@@ -281,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
__btrfs_remove_free_space_cache(cache->free_space_ctl);
/* Now with the extent entry offset into the bitmap */
- ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add space to the bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
if (ret) {
test_msg("Couldn't add extent to the cache %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
if (ret) {
test_msg("Problem removing overlapping space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+ if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
test_msg("Left something behind when removing space");
return -1;
}
@@ -315,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* [ del ]
*/
__btrfs_remove_free_space_cache(cache->free_space_ctl);
- ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
- 4 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
if (ret) {
test_msg("Couldn't add bitmap %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
- 5 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
+ 5 * SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
if (ret) {
test_msg("Failed to free our space %d\n", ret);
return ret;
}
- if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
- 5 * 1024 * 1024)) {
+ if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
test_msg("Left stuff over\n");
return -1;
}
@@ -350,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
* to return -EAGAIN back from btrfs_remove_extent, make sure this
* doesn't happen.
*/
- ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
}
- ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+ ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
- ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+ ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
if (ret) {
test_msg("Error removing bitmap and extent overlapping %d\n", ret);
return ret;
@@ -445,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int ret;
u64 offset;
u64 max_extent_size;
-
- bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
- struct btrfs_free_space *);
+ const struct btrfs_free_space_op test_free_space_ops = {
+ .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
+ .use_bitmap = test_use_bitmap,
+ };
+ const struct btrfs_free_space_op *orig_free_space_ops;
test_msg("Running space stealing from bitmap to extent\n");
@@ -469,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that forces use of bitmaps as soon as we have at least 1
* extent entry.
*/
- use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
- cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+ orig_free_space_ops = cache->free_space_ctl->op;
+ cache->free_space_ctl->op = &test_free_space_ops;
/*
* Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
+ SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -502,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 512Kb, 128Mb + 768Kb[
*/
ret = btrfs_remove_free_space(cache,
- 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024);
+ SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -525,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
+ SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -535,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -545,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
* by the bitmap too, isn't marked as free either.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024,
- 256 * 1024)) {
+ if (test_check_exists(cache, SZ_128M, SZ_256K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -556,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* lets make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -581,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
- 4096);
+ ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -601,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb - 256Kb, 128Mb - 128Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -637,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) {
test_msg("Cache free space is not 1Mb + 4Kb\n");
return -EINVAL;
}
offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+ if (offset != (SZ_128M - SZ_256K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -670,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 4096, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+ if (offset != (SZ_128M + SZ_16M)) {
test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -691,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
/*
* Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
*/
- ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024, 0);
+ ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
if (ret) {
test_msg("Couldn't add extent entry %d\n", ret);
return ret;
}
/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
- ret = test_add_free_space_entry(cache, 0,
- 128 * 1024 * 1024 - 512 * 1024, 1);
+ ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
if (ret) {
test_msg("Couldn't add bitmap entry %d\n", ret);
return ret;
@@ -717,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* [128Mb + 128b, 128Mb + 256Kb[
* [128Mb - 768Kb, 128Mb - 512Kb[
*/
- ret = btrfs_remove_free_space(cache,
- 0,
- 128 * 1024 * 1024 - 768 * 1024);
+ ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
if (ret) {
test_msg("Failed to free part of bitmap space %d\n", ret);
return ret;
}
/* Confirm that only those 2 ranges are marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
- 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 256 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
test_msg("Free space range missing\n");
return -ENOENT;
}
@@ -741,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
* as free anymore.
*/
- if (test_check_exists(cache, 0,
- 128 * 1024 * 1024 - 768 * 1024)) {
+ if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
test_msg("Bitmap region not removed from space cache\n");
return -EINVAL;
}
@@ -751,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* Confirm that the region [128Mb - 512Kb, 128Mb[, which is
* covered by the bitmap, isn't marked as free.
*/
- if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Invalid bitmap region marked as free\n");
return -EINVAL;
}
@@ -762,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* lets make sure the free space cache marks it as free in the bitmap,
* and doesn't insert a new extent entry to represent this region.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
- 512 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
test_msg("Bitmap region not marked as free\n");
return -ENOENT;
}
@@ -789,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* The goal is to test that the bitmap entry space stealing doesn't
* steal this space region.
*/
- ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+ ret = btrfs_add_free_space(cache, SZ_32M, 8192);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
@@ -800,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* expand the range covered by the existing extent entry that represents
* the free space [128Mb + 128Kb, 128Mb + 256Kb[.
*/
- ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+ ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
if (ret) {
test_msg("Error adding free space: %d\n", ret);
return ret;
}
/* Confirm the region is marked as free. */
- if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
test_msg("Extent region not marked as free\n");
return -ENOENT;
}
@@ -834,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
* that represents the 1Mb free space, and therefore we're able to
* allocate the whole free space at once.
*/
- if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
- 1 * 1024 * 1024)) {
+ if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
test_msg("Expected region not marked as free\n");
return -ENOENT;
}
- if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+ if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) {
test_msg("Cache free space is not 1Mb + 8Kb\n");
return -EINVAL;
}
- offset = btrfs_find_space_for_alloc(cache,
- 0, 1 * 1024 * 1024, 0,
+ offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
&max_extent_size);
- if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+ if (offset != (SZ_128M - 768 * SZ_1K)) {
test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -867,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
offset = btrfs_find_space_for_alloc(cache,
0, 8192, 0,
&max_extent_size);
- if (offset != (32 * 1024 * 1024)) {
+ if (offset != SZ_32M) {
test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
offset);
return -EINVAL;
@@ -877,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
if (ret)
return ret;
- cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+ cache->free_space_ctl->op = orig_free_space_ops;
__btrfs_remove_free_space_cache(cache->free_space_ctl);
return 0;
@@ -891,15 +832,17 @@ int btrfs_test_free_space_cache(void)
test_msg("Running btrfs free space cache tests\n");
- cache = init_test_block_group();
+ cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
if (!cache) {
test_msg("Couldn't run the tests\n");
return 0;
}
root = btrfs_alloc_dummy_root();
- if (!root)
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
goto out;
+ }
root->fs_info = btrfs_alloc_dummy_fs_info();
if (!root->fs_info)
@@ -920,9 +863,7 @@ int btrfs_test_free_space_cache(void)
ret = test_steal_space_from_bitmap_to_extent(cache);
out:
- __btrfs_remove_free_space_cache(cache->free_space_ctl);
- kfree(cache->free_space_ctl);
- kfree(cache);
+ btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
test_msg("Free space cache tests finished\n");
return ret;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
new file mode 100644
index 000000000000..7cea4462acd5
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -0,0 +1,572 @@
+/*
+ * Copyright (C) 2015 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../disk-io.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
+
+/*
+ * One expected free range: where it starts and how long it is. The checker
+ * compares these against the objectid/offset of free space extent keys, or
+ * against runs of set bits in the free space bitmaps.
+ */
+struct free_space_extent {
+ u64 start, length;
+};
+
+/*
+ * The test cases align their operations to this in order to hit some of the
+ * edge cases in the bitmap code.
+ *
+ * NOTE(review): the 4096 factor presumably mirrors the sector size of the
+ * dummy block group used by these tests - confirm.
+ */
+#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096)
+
+/*
+ * Compare the free space tree items of @cache against an expected list.
+ *
+ * Looks up the free space info item, checks that its extent count equals
+ * @num_extents and then walks the items that follow it in the same leaf.
+ * When the info item has BTRFS_FREE_SPACE_USING_BITMAPS set, every bitmap
+ * item is scanned in steps of cache->sectorsize and each run of set bits must
+ * match the next entry of @extents. Otherwise each following item must be a
+ * BTRFS_FREE_SPACE_EXTENT_KEY whose objectid/offset equal the start/length of
+ * the corresponding entry.
+ *
+ * @path is released before returning. Returns 0 when the tree matches, the
+ * error from search_free_space_info() if the lookup fails, or -EINVAL on any
+ * mismatch.
+ */
+static int __check_free_space_extents(struct btrfs_trans_handle *trans,
+				      struct btrfs_fs_info *fs_info,
+				      struct btrfs_block_group_cache *cache,
+				      struct btrfs_path *path,
+				      struct free_space_extent *extents,
+				      unsigned int num_extents)
+{
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key;
+	int prev_bit = 0, bit;
+	u64 extent_start = 0, offset, end;
+	u32 flags, extent_count;
+	unsigned int i;
+	int ret;
+
+	info = search_free_space_info(trans, fs_info, cache, path, 0);
+	if (IS_ERR(info)) {
+		test_msg("Could not find free space info\n");
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+	extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+	if (extent_count != num_extents) {
+		test_msg("Extent count is wrong\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+		/* The info item is expected to be the first item of the leaf. */
+		if (path->slots[0] != 0)
+			goto invalid;
+		end = cache->key.objectid + cache->key.offset;
+		i = 0;
+		while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY)
+				goto invalid;
+			offset = key.objectid;
+			while (offset < key.objectid + key.offset) {
+				bit = free_space_test_bit(cache, path, offset);
+				if (prev_bit == 0 && bit == 1) {
+					/* 0 -> 1 transition: a free run starts. */
+					extent_start = offset;
+				} else if (prev_bit == 1 && bit == 0) {
+					/*
+					 * 1 -> 0 transition: the run is over.
+					 * The bounds test comes first so that
+					 * extents[i] is never read out of range.
+					 */
+					if (i >= num_extents ||
+					    extent_start != extents[i].start ||
+					    offset - extent_start != extents[i].length)
+						goto invalid;
+					i++;
+				}
+				prev_bit = bit;
+				offset += cache->sectorsize;
+			}
+		}
+		/* A run still open after the last bitmap ends at the group end. */
+		if (prev_bit == 1) {
+			if (i >= num_extents ||
+			    extent_start != extents[i].start ||
+			    end - extent_start != extents[i].length)
+				goto invalid;
+			i++;
+		}
+		if (i != num_extents)
+			goto invalid;
+	} else {
+		/* Info item in slot 0 followed by exactly one item per extent. */
+		if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 ||
+		    path->slots[0] != 0)
+			goto invalid;
+		for (i = 0; i < num_extents; i++) {
+			path->slots[0]++;
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY ||
+			    key.objectid != extents[i].start ||
+			    key.offset != extents[i].length)
+				goto invalid;
+		}
+	}
+
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	return ret;
+invalid:
+	test_msg("Free space tree is invalid\n");
+	ret = -EINVAL;
+	goto out;
+}
+
+/*
+ * Check the free space tree of @cache against @extents in its current
+ * representation, then convert it to the other representation (bitmaps to
+ * extents or extents to bitmaps) and check it again.
+ *
+ * Returns 0 if both checks pass, otherwise the first error encountered.
+ */
+static int check_free_space_extents(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_group_cache *cache,
+				    struct btrfs_path *path,
+				    struct free_space_extent *extents,
+				    unsigned int num_extents)
+{
+	struct btrfs_free_space_info *fsi;
+	u32 using_bitmaps;
+	int rc;
+
+	fsi = search_free_space_info(trans, fs_info, cache, path, 0);
+	if (IS_ERR(fsi)) {
+		test_msg("Could not find free space info\n");
+		btrfs_release_path(path);
+		return PTR_ERR(fsi);
+	}
+	/* Remember the current format before the path is dropped. */
+	using_bitmaps = btrfs_free_space_flags(path->nodes[0], fsi) &
+			BTRFS_FREE_SPACE_USING_BITMAPS;
+	btrfs_release_path(path);
+
+	rc = __check_free_space_extents(trans, fs_info, cache, path, extents,
+					num_extents);
+	if (rc)
+		return rc;
+
+	/* Flip it to the other format and check that for good measure. */
+	if (using_bitmaps) {
+		rc = convert_free_space_to_extents(trans, fs_info, cache, path);
+		if (rc)
+			test_msg("Could not convert to extents\n");
+	} else {
+		rc = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+		if (rc)
+			test_msg("Could not convert to bitmaps\n");
+	}
+	if (rc)
+		return rc;
+
+	return __check_free_space_extents(trans, fs_info, cache, path, extents,
+					  num_extents);
+}
+
+/*
+ * Without touching the tree, the whole block group must show up as a single
+ * free extent.
+ */
+static int test_empty_block_group(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *cache,
+				  struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{ .start = cache->key.objectid, .length = cache->key.offset },
+	};
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Remove the whole block group range from the free space tree and expect no
+ * free extents to remain.
+ */
+static int test_remove_all(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid,
+					    cache->key.offset);
+	if (!err)
+		return check_free_space_extents(trans, fs_info, cache, path,
+						expected,
+						ARRAY_SIZE(expected));
+
+	test_msg("Could not remove free space\n");
+	return err;
+}
+
+/*
+ * Remove the first BITMAP_RANGE of the block group and expect one free extent
+ * covering everything after it.
+ */
+static int test_remove_beginning(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_group_cache *cache,
+				 struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	struct free_space_extent expected[] = {
+		{
+			.start = bg_start + BITMAP_RANGE,
+			.length = cache->key.offset - BITMAP_RANGE,
+		},
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Remove the last BITMAP_RANGE of the block group and expect one free extent
+ * covering everything before it.
+ */
+static int test_remove_end(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	const u64 bg_len = cache->key.offset;
+	struct free_space_extent expected[] = {
+		{ .start = bg_start, .length = bg_len - BITMAP_RANGE },
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start + bg_len - BITMAP_RANGE,
+					    BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Remove the second BITMAP_RANGE of the block group and expect two free
+ * extents: the first BITMAP_RANGE and everything after the hole.
+ */
+static int test_remove_middle(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_block_group_cache *cache,
+			      struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	struct free_space_extent expected[] = {
+		{ .start = bg_start, .length = BITMAP_RANGE },
+		{
+			.start = bg_start + 2 * BITMAP_RANGE,
+			.length = cache->key.offset - 2 * BITMAP_RANGE,
+		},
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start + BITMAP_RANGE,
+					    BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, then add the first BITMAP_RANGE followed by the
+ * second one. The two adjacent ranges must be reported as one extent.
+ */
+static int test_merge_left(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	/* Offsets from the block group start, in insertion order. */
+	const u64 add_order[] = { 0, BITMAP_RANGE };
+	struct free_space_extent expected[] = {
+		{ .start = bg_start, .length = 2 * BITMAP_RANGE },
+	};
+	unsigned int n;
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	for (n = 0; n < ARRAY_SIZE(add_order); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       bg_start + add_order[n],
+					       BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, then add the third BITMAP_RANGE followed by the
+ * second one. The two adjacent ranges must be reported as one extent.
+ */
+static int test_merge_right(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_group_cache *cache,
+			    struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	/* Offsets from the block group start, in insertion order. */
+	const u64 add_order[] = { 2 * BITMAP_RANGE, BITMAP_RANGE };
+	struct free_space_extent expected[] = {
+		{
+			.start = bg_start + BITMAP_RANGE,
+			.length = 2 * BITMAP_RANGE,
+		},
+	};
+	unsigned int n;
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	for (n = 0; n < ARRAY_SIZE(add_order); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       bg_start + add_order[n],
+					       BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, add the first and third BITMAP_RANGE, then fill the
+ * gap between them. All three ranges must be reported as one extent.
+ */
+static int test_merge_both(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	/* Offsets from the block group start, in insertion order. */
+	const u64 add_order[] = { 0, 2 * BITMAP_RANGE, BITMAP_RANGE };
+	struct free_space_extent expected[] = {
+		{ .start = bg_start, .length = 3 * BITMAP_RANGE },
+	};
+	unsigned int n;
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	for (n = 0; n < ARRAY_SIZE(add_order); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       bg_start + add_order[n],
+					       BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Empty the block group, then add three BITMAP_RANGE sized ranges that do not
+ * touch each other (first, fifth, third). They must stay three separate
+ * extents.
+ */
+static int test_merge_none(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	/* Offsets from the block group start, in insertion order. */
+	const u64 add_order[] = { 0, 4 * BITMAP_RANGE, 2 * BITMAP_RANGE };
+	struct free_space_extent expected[] = {
+		{ .start = bg_start, .length = BITMAP_RANGE },
+		{ .start = bg_start + 2 * BITMAP_RANGE, .length = BITMAP_RANGE },
+		{ .start = bg_start + 4 * BITMAP_RANGE, .length = BITMAP_RANGE },
+	};
+	unsigned int n;
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	for (n = 0; n < ARRAY_SIZE(add_order); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       bg_start + add_order[n],
+					       BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Signature shared by every test case in this file: it receives the dummy
+ * transaction, fs_info, block group and path, and returns 0 on success or a
+ * non-zero error.
+ */
+typedef int (*test_func_t)(struct btrfs_trans_handle *,
+ struct btrfs_fs_info *,
+ struct btrfs_block_group_cache *,
+ struct btrfs_path *);
+
+/*
+ * Run one test case against a freshly built dummy environment.
+ *
+ * Sets up a dummy root (used both as tree root and free space root), a dummy
+ * fs_info with the FREE_SPACE_TREE compat_ro flag, an empty leaf as the root
+ * node and a dummy block group of 8 * BITMAP_RANGE. The block group's free
+ * space is added to the tree, optionally converted to bitmaps when @bitmaps
+ * is non-zero, and then @test_func is called. Afterwards the block group's
+ * free space is removed again and the tree must be left without items.
+ *
+ * Returns 0 on success or a negative errno. Every failure leaves through the
+ * common "out" label so that everything allocated so far is released.
+ */
+static int run_test(test_func_t test_func, int bitmaps)
+{
+	struct btrfs_root *root = NULL;
+	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_trans_handle trans;
+	struct btrfs_path *path = NULL;
+	int ret;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate dummy root\n");
+		ret = PTR_ERR(root);
+		/*
+		 * NOTE(review): "out" passes this ERR_PTR value to
+		 * btrfs_free_dummy_root(); confirm that helper tolerates it.
+		 */
+		goto out;
+	}
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+					BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
+	root->fs_info->free_space_root = root;
+	root->fs_info->tree_root = root;
+
+	root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	/* Start from an empty leaf. */
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_nritems(root->node, 0);
+	root->alloc_bytenr += 8192;
+
+	cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE);
+	if (!cache) {
+		test_msg("Couldn't allocate dummy block group cache\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	cache->bitmap_low_thresh = 0;
+	cache->bitmap_high_thresh = (u32)-1;
+	cache->needs_free_space = 1;
+	cache->fs_info = root->fs_info;
+
+	btrfs_init_dummy_trans(&trans);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Couldn't allocate path\n");
+		/* Go through "out" so the root and block group are freed. */
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = add_block_group_free_space(&trans, root->fs_info, cache);
+	if (ret) {
+		test_msg("Could not add block group free space\n");
+		goto out;
+	}
+
+	if (bitmaps) {
+		ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
+						    cache, path);
+		if (ret) {
+			test_msg("Could not convert block group to bitmaps\n");
+			goto out;
+		}
+	}
+
+	ret = test_func(&trans, root->fs_info, cache, path);
+	if (ret)
+		goto out;
+
+	ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+	if (ret) {
+		test_msg("Could not remove block group free space\n");
+		goto out;
+	}
+
+	if (btrfs_header_nritems(root->node) != 0) {
+		test_msg("Free space tree has leftover items\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = 0;
+out:
+	/* path and cache may still be NULL here when an early step failed. */
+	btrfs_free_path(path);
+	btrfs_free_dummy_block_group(cache);
+	btrfs_free_dummy_root(root);
+	return ret;
+}
+
+/*
+ * Run @test_func first with the extent representation and, if that passes,
+ * again with the bitmap representation. Returns the first failure, or 0.
+ */
+static int run_test_both_formats(test_func_t test_func)
+{
+	int err = run_test(test_func, 0);
+
+	return err ? err : run_test(test_func, 1);
+}
+
+/*
+ * Entry point of the free space tree self tests: runs every test case in
+ * both formats and stops at the first one that fails.
+ *
+ * Returns 0 if all cases pass, otherwise the error of the failing case.
+ */
+int btrfs_test_free_space_tree(void)
+{
+	test_func_t cases[] = {
+		test_empty_block_group,
+		test_remove_all,
+		test_remove_beginning,
+		test_remove_end,
+		test_remove_middle,
+		test_merge_left,
+		test_merge_right,
+		test_merge_both,
+		test_merge_none,
+	};
+	int err = 0;
+	int idx;
+
+	test_msg("Running free space tree tests\n");
+	for (idx = 0; idx < ARRAY_SIZE(cases); idx++) {
+		err = run_test_both_formats(cases[idx]);
+		if (err) {
+			test_msg("%pf failed\n", cases[idx]);
+			break;
+		}
+	}
+
+	return err;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 054fc0d97131..863a6a3af1f8 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -22,6 +22,7 @@
#include "../disk-io.h"
#include "../extent_io.h"
#include "../volumes.h"
+#include "../compression.h"
static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
u64 ram_bytes, u64 offset, u64 disk_bytenr,
@@ -100,7 +101,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
static void setup_file_extents(struct btrfs_root *root)
{
int slot = 0;
- u64 disk_bytenr = 1 * 1024 * 1024;
+ u64 disk_bytenr = SZ_1M;
u64 offset = 0;
/* First we want a hole */
@@ -974,7 +975,7 @@ static int test_extent_accounting(void)
(BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
EXTENT_DELALLOC | EXTENT_DIRTY |
EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1045,7 +1046,7 @@ static int test_extent_accounting(void)
BTRFS_MAX_EXTENT_SIZE+8191,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1079,7 +1080,7 @@ static int test_extent_accounting(void)
ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
if (ret) {
test_msg("clear_extent_bit returned %d\n", ret);
goto out;
@@ -1096,7 +1097,7 @@ out:
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
- NULL, GFP_NOFS);
+ NULL, GFP_KERNEL);
iput(inode);
btrfs_free_dummy_root(root);
return ret;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 846d277b1901..8ea5d34bc5a2 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -23,14 +23,6 @@
#include "../qgroup.h"
#include "../backref.h"
-static void init_dummy_trans(struct btrfs_trans_handle *trans)
-{
- memset(trans, 0, sizeof(*trans));
- trans->transid = 1;
- INIT_LIST_HEAD(&trans->qgroup_ref_list);
- trans->type = __TRANS_DUMMY;
-}
-
static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u64 parent, u64 root_objectid)
{
@@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
ins.objectid = bytenr;
ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
struct btrfs_path *path;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
u64 refs;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup basic add\n");
ret = btrfs_create_qgroup(NULL, fs_info, 5);
@@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root)
struct ulist *new_roots = NULL;
int ret;
- init_dummy_trans(&trans);
+ btrfs_init_dummy_trans(&trans);
test_msg("Qgroup multiple refs test\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 418c6a2ad7d8..43885e51b882 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
list_del_init(&em->list);
free_extent_map(em);
}
+ /*
+ * If any block groups are found in ->deleted_bgs then it's
+ * because the transaction was aborted and a commit did not
+ * happen (things failed before writing the new superblock
+ * and calling btrfs_finish_extent_commit()), so we can not
+ * discard the physical locations of the block groups.
+ */
+ while (!list_empty(&transaction->deleted_bgs)) {
+ struct btrfs_block_group_cache *cache;
+
+ cache = list_first_entry(&transaction->deleted_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
+ list_del_init(&cache->bg_list);
+ btrfs_put_block_group_trimming(cache);
+ btrfs_put_block_group(cache);
+ }
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
}
@@ -274,7 +291,6 @@ loop:
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
- spin_lock_init(&cur_trans->deleted_bgs_lock);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -592,6 +608,40 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL);
}
+/*
+ * Start a transaction reserving space for @num_items. If that fails with
+ * -ENOSPC, start a transaction without a reservation and try to move the
+ * needed bytes into the transaction block reserve through
+ * btrfs_cond_migrate_bytes(), passing @min_factor along.
+ *
+ * Returns the transaction handle, or an ERR_PTR() on failure.
+ */
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+					struct btrfs_root *root,
+					unsigned int num_items,
+					int min_factor)
+{
+	struct btrfs_trans_handle *handle;
+	struct btrfs_fs_info *fs_info;
+	u64 bytes_needed;
+	int err;
+
+	handle = btrfs_start_transaction(root, num_items);
+	/* Success, or a failure other than ENOSPC: nothing more to try. */
+	if (!IS_ERR(handle))
+		return handle;
+	if (PTR_ERR(handle) != -ENOSPC)
+		return handle;
+
+	handle = btrfs_start_transaction(root, 0);
+	if (IS_ERR(handle))
+		return handle;
+
+	bytes_needed = btrfs_calc_trans_metadata_size(root, num_items);
+	fs_info = root->fs_info;
+	err = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
+				       bytes_needed, min_factor);
+	if (err) {
+		btrfs_end_transaction(handle, root);
+		return ERR_PTR(err);
+	}
+
+	/* Account the migrated bytes to this handle. */
+	handle->block_rsv = &fs_info->trans_block_rsv;
+	handle->bytes_reserved = bytes_needed;
+	trace_btrfs_space_reservation(fs_info, "transaction",
+				      handle->transid, bytes_needed, 1);
+
+	return handle;
+}
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_root *root,
@@ -603,17 +653,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN, 0);
+ return start_transaction(root, 0, TRANS_JOIN,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
+ return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
+ BTRFS_RESERVE_NO_FLUSH);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_USERSPACE, 0);
+ return start_transaction(root, 0, TRANS_USERSPACE,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -631,7 +684,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
*/
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
- return start_transaction(root, 0, TRANS_ATTACH, 0);
+ return start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
}
/*
@@ -646,7 +700,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+ trans = start_transaction(root, 0, TRANS_ATTACH,
+ BTRFS_RESERVE_NO_FLUSH);
if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
btrfs_wait_for_commit(root, 0);
@@ -1280,7 +1335,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
- struct timespec cur_time = CURRENT_TIME;
+ struct timespec cur_time;
int ret = 0;
u64 to_reserve = 0;
u64 index = 0;
@@ -1288,17 +1343,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 root_flags;
uuid_le new_uuid;
- path = btrfs_alloc_path();
- if (!path) {
- pending->error = -ENOMEM;
- return 0;
- }
+ ASSERT(pending->path);
+ path = pending->path;
- new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
- if (!new_root_item) {
- pending->error = -ENOMEM;
- goto root_item_alloc_fail;
- }
+ ASSERT(pending->root_item);
+ new_root_item = pending->root_item;
pending->error = btrfs_find_free_objectid(tree_root, &objectid);
if (pending->error)
@@ -1328,12 +1377,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
trans->bytes_reserved = trans->block_rsv->reserved;
-
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ trans->transid,
+ trans->bytes_reserved, 1);
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
+ cur_time = current_fs_time(parent_inode->i_sb);
+
/*
* insert the directory item
*/
@@ -1476,7 +1529,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->i_size +
dentry->d_name.len * 2);
- parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+ parent_inode->i_mtime = parent_inode->i_ctime =
+ current_fs_time(parent_inode->i_sb);
ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -1531,8 +1585,10 @@ clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
no_free_objectid:
kfree(new_root_item);
-root_item_alloc_fail:
+ pending->root_item = NULL;
btrfs_free_path(path);
+ pending->path = NULL;
+
return ret;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b05b2f64d913..72be51f7ca2f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -77,8 +77,8 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ /* Protected by spin lock fs_info->unused_bgs_lock. */
struct list_head deleted_bgs;
- spinlock_t deleted_bgs_lock;
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
@@ -137,8 +137,10 @@ struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
struct btrfs_root *root;
+ struct btrfs_root_item *root_item;
struct btrfs_root *snap;
struct btrfs_qgroup_inherit *inherit;
+ struct btrfs_path *path;
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv;
u64 qgroup_reserved;
@@ -185,6 +187,10 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
unsigned int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+ struct btrfs_root *root,
+ unsigned int num_items,
+ int min_factor);
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
struct btrfs_root *root,
unsigned int num_items);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f31db4325339..cb65089127cc 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
btrfs_release_path(path);
+ /*
+ * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+ * leaves from path->nodes[1], so set lowest_level to 1 to avoid later
+ * a deadlock (attempting to write lock an already write locked leaf).
+ */
+ path->lowest_level = 1;
wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = 0;
goto out;
}
- path->slots[1] = btrfs_header_nritems(path->nodes[1]);
- next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- min_trans);
+ /*
+ * The node at level 1 must always be locked when our path has
+ * keep_locks set and lowest_level is 1, regardless of the value of
+ * path->slots[1].
+ */
+ BUG_ON(path->locks[1] == 0);
ret = btrfs_realloc_node(trans, root,
path->nodes[1], 0,
&last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
WARN_ON(ret == -EAGAIN);
goto out;
}
+ /*
+ * Now that we reallocated the node we can find the next key. Note that
+ * btrfs_find_next_key() can release our path and do another search
+ * without COWing, this is because even with path->keep_locks = 1,
+ * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
+ * node when path->slots[node_level - 1] does not point to the last
+ * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+ * we search for the next key after reallocating our node.
+ */
+ path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+ next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+ min_trans);
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..517d0ccb351e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
#include "print-tree.h"
#include "backref.h"
#include "hash.h"
+#include "compression.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -1045,7 +1046,7 @@ again:
/*
* NOTE: we have searched root tree and checked the
- * coresponding ref, it does not need to check again.
+ * corresponding ref, it does not need to check again.
*/
*search_done = 1;
}
@@ -4127,7 +4128,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
struct list_head *logged_list,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ const u64 start,
+ const u64 end)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -4166,7 +4169,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
}
list_sort(NULL, &extents, extent_cmp);
-
+ /*
+ * Collect any new ordered extents within the range. This is to
+ * prevent logging file extent items without waiting for the disk
+ * location they point to being written. We do this only to deal
+ * with races against concurrent lockless direct IO writes.
+ */
+ btrfs_get_logged_extents(inode, logged_list, start, end);
process:
while (!list_empty(&extents)) {
em = list_entry(extents.next, struct extent_map, list);
@@ -4406,6 +4415,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * When we are logging a new inode X, check if it doesn't have a reference that
+ * matches the reference from some other inode Y created in a past transaction
+ * and that was renamed in the current transaction. If we don't do this, then at
+ * log replay time we can lose inode Y (and all its files if it's a directory):
+ *
+ * mkdir /mnt/x
+ * echo "hello world" > /mnt/x/foobar
+ * sync
+ * mv /mnt/x /mnt/y
+ * mkdir /mnt/x # or touch /mnt/x
+ * xfs_io -c fsync /mnt/x
+ * <power fail>
+ * mount fs, trigger log replay
+ *
+ * After the log replay procedure, we would lose the first directory and all its
+ * files (file foobar).
+ * For the case where inode Y is not a directory we simply end up losing it:
+ *
+ * echo "123" > /mnt/foo
+ * sync
+ * mv /mnt/foo /mnt/bar
+ * echo "abc" > /mnt/foo
+ * xfs_io -c fsync /mnt/foo
+ * <power fail>
+ *
+ * We also need this for cases where a snapshot entry is replaced by some other
+ * entry (file or directory) otherwise we end up with an unreplayable log due to
+ * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
+ * if it were a regular entry:
+ *
+ * mkdir /mnt/x
+ * btrfs subvolume snapshot /mnt /mnt/x/snap
+ * btrfs subvolume delete /mnt/x/snap
+ * rmdir /mnt/x
+ * mkdir /mnt/x
+ * fsync /mnt/x or fsync some new file inside it
+ * <power fail>
+ *
+ * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
+ * the same transaction.
+ */
+/*
+ * Walk every name stored in the inode ref / extref item at @slot of @eb and
+ * look each one up as a directory item in the commit root of @inode's root.
+ *
+ * Returns 1 if any of the names already exists there, 0 if none does, or a
+ * negative errno on allocation or lookup failure.
+ */
+static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+					 const int slot,
+					 const struct btrfs_key *key,
+					 struct inode *inode)
+{
+	int ret = 0;
+	struct btrfs_path *search_path;
+	char *name_buf = NULL;
+	u32 buf_len = 0;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u32 pos = 0;
+	unsigned long item_ptr = btrfs_item_ptr_offset(eb, slot);
+
+	search_path = btrfs_alloc_path();
+	if (!search_path)
+		return -ENOMEM;
+	search_path->search_commit_root = 1;
+	search_path->skip_locking = 1;
+
+	while (pos < item_size) {
+		u64 parent;
+		u32 this_name_len;
+		u32 this_len;
+		unsigned long name_ptr;
+		struct btrfs_dir_item *di;
+
+		if (key->type == BTRFS_INODE_REF_KEY) {
+			struct btrfs_inode_ref *iref;
+
+			/* For plain refs the parent is the key offset. */
+			iref = (struct btrfs_inode_ref *)(item_ptr + pos);
+			parent = key->offset;
+			this_name_len = btrfs_inode_ref_name_len(eb, iref);
+			name_ptr = (unsigned long)(iref + 1);
+			this_len = sizeof(*iref) + this_name_len;
+		} else {
+			struct btrfs_inode_extref *extref;
+
+			/* Extended refs carry the parent in the entry itself. */
+			extref = (struct btrfs_inode_extref *)(item_ptr + pos);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			this_name_len = btrfs_inode_extref_name_len(eb, extref);
+			name_ptr = (unsigned long)&extref->name;
+			this_len = sizeof(*extref) + this_name_len;
+		}
+
+		/* Grow the scratch buffer only when this name is longer. */
+		if (this_name_len > buf_len) {
+			char *bigger;
+
+			bigger = krealloc(name_buf, this_name_len, GFP_NOFS);
+			if (!bigger) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			buf_len = this_name_len;
+			name_buf = bigger;
+		}
+
+		read_extent_buffer(eb, name_buf, name_ptr, this_name_len);
+		di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root,
+					   search_path, parent,
+					   name_buf, this_name_len, 0);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		if (di) {
+			ret = 1;
+			goto out;
+		}
+		btrfs_release_path(search_path);
+
+		pos += this_len;
+	}
+out:
+	btrfs_free_path(search_path);
+	kfree(name_buf);
+	return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -4492,7 +4622,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(inode, &logged_list, start, end);
+ /*
+ * Collect ordered extents only if we are logging data. This is to
+ * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
+ * will process the ordered extents if they still exist at the time,
+ * because when we collect them we test and set for the flag
+ * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
+ * same ordered extents. The consequence for the LOG_INODE_ALL log mode
+ * not processing the ordered extents is that we end up logging the
+ * corresponding file extent items, based on the extent maps in the
+ * inode's extent_map_tree's modified_list, without logging the
+ * respective checksums (since they may still be only attached to the
+ * ordered extents and have not been inserted in the csum tree by
+ * btrfs_finish_ordered_io() yet).
+ */
+ if (inode_only == LOG_INODE_ALL)
+ btrfs_get_logged_extents(inode, &logged_list, start, end);
/*
* a brute force approach to making sure we get the most uptodate
@@ -4578,6 +4723,22 @@ again:
if (min_key.type == BTRFS_INODE_ITEM_KEY)
need_log_inode_item = false;
+ if ((min_key.type == BTRFS_INODE_REF_KEY ||
+ min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+ BTRFS_I(inode)->generation == trans->transid) {
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0],
+ &min_key, inode);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ } else if (ret > 0) {
+ err = 1;
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ goto out_unlock;
+ }
+ }
+
/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
if (ins_nr == 0)
@@ -4701,7 +4862,7 @@ log_extents:
goto out_unlock;
}
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- &logged_list, ctx);
+ &logged_list, ctx, start, end);
if (ret) {
err = ret;
goto out_unlock;
@@ -4764,6 +4925,42 @@ out_unlock:
}
/*
+ * Check if we must fall back to a transaction commit when logging an inode.
+ * This must be called after logging the inode and is used only in the context
+ * where fsyncing an inode requires logging some other inode - in which
+ * case we can't lock the i_mutex of each other inode we need to log as that
+ * can lead to deadlocks with concurrent fsync against other inodes (as we can
+ * log inodes up or down in the hierarchy) or rename operations for example. So
+ * we take the log_mutex of the inode after we have logged it and then check for
+ * its last_unlink_trans value - this is safe because any task setting
+ * last_unlink_trans must take the log_mutex and it must do this before it does
+ * the actual unlink operation, so if we do this check before a concurrent task
+ * sets last_unlink_trans it means we've logged a consistent version/state of
+ * all the inode items, otherwise we are not sure and must do a transaction
+ * commit (the concurrent task might have only updated last_unlink_trans before
+ * we logged the inode or it might have also done the unlink).
+ */
+static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ bool ret = false;
+
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
+ if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
+ /*
+ * Make sure any commits to the log are forced to be full
+ * commits.
+ */
+ btrfs_set_log_full_commit(fs_info, trans);
+ ret = true;
+ }
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+ return ret;
+}
+
+/*
* follow the dentry parent pointers up the chain and see if any
* of the directories in it require a full commit before they can
* be logged. Returns zero if nothing special needs to be done or 1 if
@@ -4776,7 +4973,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
u64 last_committed)
{
int ret = 0;
- struct btrfs_root *root;
struct dentry *old_parent = NULL;
struct inode *orig_inode = inode;
@@ -4808,14 +5004,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
BTRFS_I(inode)->logged_trans = trans->transid;
smp_mb();
- if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
- root = BTRFS_I(inode)->root;
-
- /*
- * make sure any commits to the log are forced
- * to be full commits
- */
- btrfs_set_log_full_commit(root->fs_info, trans);
+ if (btrfs_must_commit_transaction(trans, inode)) {
ret = 1;
break;
}
@@ -4974,6 +5163,9 @@ process_leaf:
btrfs_release_path(path);
ret = btrfs_log_inode(trans, root, di_inode,
log_mode, 0, LLONG_MAX, ctx);
+ if (!ret &&
+ btrfs_must_commit_transaction(trans, di_inode))
+ ret = 1;
iput(di_inode);
if (ret)
goto next_dir_inode;
@@ -5088,6 +5280,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root, dir_inode,
LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ if (!ret &&
+ btrfs_must_commit_transaction(trans, dir_inode))
+ ret = 1;
iput(dir_inode);
if (ret)
goto out;
@@ -5439,6 +5634,9 @@ error:
* They revolve around files there were unlinked from the directory, and
* this function updates the parent directory so that a full commit is
* properly done if it is fsync'd later after the unlinks are done.
+ *
+ * Must be called before the unlink operations (updates to the subvolume tree,
+ * inodes, etc) are done.
*/
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
@@ -5454,8 +5652,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
* into the file. When the file is logged we check it and
* don't log the parents if the file is fully on disk.
*/
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode)) {
+ mutex_lock(&BTRFS_I(inode)->log_mutex);
BTRFS_I(inode)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ }
/*
* if this directory was already logged any new
@@ -5486,7 +5687,29 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
return;
record:
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
BTRFS_I(dir)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
+}
+
+/*
+ * Make sure that if someone attempts to fsync the parent directory of a deleted
+ * snapshot, it ends up triggering a transaction commit. This is to guarantee
+ * that after replaying the log tree of the parent directory's root we will not
+ * see the snapshot anymore and at log replay time we will not see any log tree
+ * corresponding to the deleted snapshot's root, which could lead to replaying
+ * it after replaying the log tree of the parent directory (which would replay
+ * the snapshot delete operation).
+ *
+ * Must be called before the actual snapshot destroy operation (updates to the
+ * parent root and the tree of tree roots, etc) is done.
+ */
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct inode *dir)
+{
+ mutex_lock(&BTRFS_I(dir)->log_mutex);
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
}
/*
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 6916a781ea02..a9f1b75d080d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct inode *dir, struct inode *inode,
int for_rename);
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+ struct inode *dir);
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *old_dir,
struct dentry *parent);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 17ed76d18eb6..bd0f45fb38c4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
},
};
-const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
[BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
[BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
@@ -125,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static void btrfs_close_one_device(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -137,7 +138,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
struct btrfs_fs_devices *fs_devs;
- fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
+ fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
if (!fs_devs)
return ERR_PTR(-ENOMEM);
@@ -219,7 +220,7 @@ static struct btrfs_device *__alloc_device(void)
{
struct btrfs_device *dev;
- dev = kzalloc(sizeof(*dev), GFP_NOFS);
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return ERR_PTR(-ENOMEM);
@@ -232,8 +233,9 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+ btrfs_device_data_ordered_init(dev);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
return dev;
}
@@ -731,7 +733,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name) {
- name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ name = rcu_string_strdup(orig_dev->name->str,
+ GFP_KERNEL);
if (!name) {
kfree(device);
goto error;
@@ -1022,16 +1025,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
}
/* make sure our super fits in the device */
- if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+ if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
goto error_bdev_put;
/* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+ if (sizeof(*disk_super) > PAGE_SIZE)
goto error_bdev_put;
/* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_CACHE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+ index = bytenr >> PAGE_SHIFT;
+ if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
goto error_bdev_put;
/* pull in the page with our super */
@@ -1044,7 +1047,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
p = kmap(page);
/* align our pointer to the offset of the super block */
- disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+ disk_super = p + (bytenr & ~PAGE_MASK);
if (btrfs_super_bytenr(disk_super) != bytenr ||
btrfs_super_magic(disk_super) != BTRFS_MAGIC)
@@ -1072,7 +1075,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
error_unmap:
kunmap(page);
- page_cache_release(page);
+ put_page(page);
error_bdev_put:
blkdev_put(bdev, flags);
@@ -1102,7 +1105,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
key.objectid = device->devid;
key.offset = start;
@@ -1182,7 +1185,7 @@ again:
struct map_lookup *map;
int i;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
u64 end;
@@ -1257,6 +1260,15 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
int ret;
int slot;
struct extent_buffer *l;
+ u64 min_search_start;
+
+ /*
+ * We don't want to overwrite the superblock on the drive nor any area
+ * used by the boot loader (grub for example), so we make sure to start
+ * at an offset of at least 1MB.
+ */
+ min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ search_start = max(search_start, min_search_start);
path = btrfs_alloc_path();
if (!path)
@@ -1271,7 +1283,7 @@ again:
goto out;
}
- path->reada = 2;
+ path->reada = READA_FORWARD;
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -1397,18 +1409,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
- struct btrfs_root *root = device->dev_root;
- u64 search_start;
-
/* FIXME use last free of some kind */
-
- /*
- * we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
- */
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
return find_free_dev_extent_start(trans->transaction, device,
- num_bytes, search_start, start, len);
+ num_bytes, 0, start, len);
}
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
@@ -1642,7 +1645,6 @@ static void update_dev_time(char *path_name)
return;
file_update_time(filp);
filp_close(filp, NULL);
- return;
}
static int btrfs_rm_dev_item(struct btrfs_root *root,
@@ -1713,12 +1715,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
} while (read_seqretry(&root->fs_info->profiles_lock, seq));
num_devices = root->fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+ btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
@@ -1973,8 +1975,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->writeable) {
fs_devices->rw_devices--;
/* zero out the old super if it is writable */
- btrfs_scratch_superblocks(srcdev->bdev,
- rcu_str_deref(srcdev->name));
+ btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
if (srcdev->bdev)
@@ -2024,8 +2025,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
if (tgtdev->bdev) {
- btrfs_scratch_superblocks(tgtdev->bdev,
- rcu_str_deref(tgtdev->name));
+ btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
@@ -2288,7 +2288,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- name = rcu_string_strdup(device_path, GFP_NOFS);
+ name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
kfree(device);
ret = -ENOMEM;
@@ -2749,7 +2749,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
em->start + em->len < chunk_offset) {
/*
* This is a logic error, but we don't want to just rely on the
- * user having built with ASSERT enabled, so if ASSERT doens't
+ * user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
@@ -2757,7 +2757,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
free_extent_map(em);
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
lock_chunks(root->fs_info->chunk_root);
check_system_chunk(trans, extent_root, map->type);
unlock_chunks(root->fs_info->chunk_root);
@@ -2853,7 +2853,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
if (ret)
return ret;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_trans_remove_block_group(root->fs_info,
+ chunk_offset);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_std_error(root->fs_info, ret, NULL);
@@ -2966,7 +2967,7 @@ static int insert_balance_item(struct btrfs_root *root,
}
key.objectid = BTRFS_BALANCE_OBJECTID;
- key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -3015,7 +3016,7 @@ static int del_balance_item(struct btrfs_root *root)
}
key.objectid = BTRFS_BALANCE_OBJECTID;
- key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -3123,7 +3124,7 @@ static int chunk_profiles_filter(u64 chunk_type,
return 1;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
@@ -3156,7 +3157,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
return ret;
}
-static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info,
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
u64 chunk_offset, struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
@@ -3400,13 +3401,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_data = 0;
u32 count_meta = 0;
u32 count_sys = 0;
+ int chunk_reserved = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = btrfs_device_get_total_bytes(device);
size_to_free = div_factor(old_size, 1);
- size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+ size_to_free = min_t(u64, size_to_free, SZ_1M);
if (!device->writeable ||
btrfs_device_get_total_bytes(device) -
btrfs_device_get_bytes_used(device) > size_to_free ||
@@ -3501,6 +3503,7 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
+
btrfs_release_path(path);
if (!ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3537,6 +3540,24 @@ again:
goto loop;
}
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ trans = btrfs_start_transaction(chunk_root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ ret = btrfs_force_chunk_alloc(trans, chunk_root,
+ BTRFS_BLOCK_GROUP_DATA);
+ btrfs_end_transaction(trans, chunk_root);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto error;
+ }
+ chunk_reserved = 1;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3666,12 +3687,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&fs_info->dev_replace);
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
BUG_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
if (num_devices == 1)
allowed |= BTRFS_BLOCK_GROUP_DUP;
@@ -3704,14 +3725,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
goto out;
}
- /* allow dup'ed data chunks only in mixed mode */
- if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
- btrfs_err(fs_info, "dup for data is not allowed");
- ret = -EINVAL;
- goto out;
- }
-
/* allow to reduce meta or sys integrity only if force set */
allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
@@ -3737,6 +3750,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
} while (read_seqretry(&fs_info->profiles_lock, seq));
+ if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
+ btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+ btrfs_warn(fs_info,
+ "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
+ bctl->meta.target, bctl->data.target);
+ }
+
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
fs_info->num_tolerated_disk_barrier_failures = min(
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
@@ -3848,7 +3868,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
return -ENOMEM;
key.objectid = BTRFS_BALANCE_OBJECTID;
- key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.type = BTRFS_TEMPORARY_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
@@ -4099,7 +4119,7 @@ out:
* Callback for btrfs_uuid_tree_iterate().
* returns:
* 0 check succeeded, the entry is not outdated.
- * < 0 if an error occured.
+ * < 0 if an error occurred.
* > 0 if the check failed, which means the caller shall remove the entry.
*/
static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
@@ -4249,7 +4269,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
lock_chunks(root);
@@ -4441,7 +4461,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
/* TODO allow them to set a preferred stripe size */
- return 64 * 1024;
+ return SZ_64K;
}
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
@@ -4509,21 +4529,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ncopies = btrfs_raid_array[index].ncopies;
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_stripe_size = 1024 * 1024 * 1024;
+ max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
- if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
- max_stripe_size = 1024 * 1024 * 1024;
+ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+ max_stripe_size = SZ_1G;
else
- max_stripe_size = 256 * 1024 * 1024;
+ max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = 32 * 1024 * 1024;
+ max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
if (!devs_max)
devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
@@ -4700,7 +4720,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
goto error;
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = start;
em->len = num_bytes;
em->block_start = 0;
@@ -4774,7 +4794,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
u64 dev_offset;
u64 stripe_size;
int i = 0;
- int ret;
+ int ret = 0;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
@@ -4795,7 +4815,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
stripe_size = em->orig_block_len;
@@ -4805,20 +4825,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * Take the device list mutex to prevent races with the final phase of
+ * a device replace operation that replaces the device object associated
+ * with the map's stripes, because the device object's id can change
+ * at any time during that final phase of the device replace operation
+ * (dev-replace.c:btrfs_dev_replace_finishing()).
+ */
+ mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
dev_offset = map->stripes[i].physical;
ret = btrfs_update_device(trans, device);
if (ret)
- goto out;
+ break;
ret = btrfs_alloc_dev_extent(trans, device,
chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
chunk_offset, dev_offset,
stripe_size);
if (ret)
- goto out;
+ break;
+ }
+ if (ret) {
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+ goto out;
}
stripe = &chunk->stripe;
@@ -4831,6 +4863,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
stripe++;
}
+ mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
btrfs_set_stack_chunk_length(chunk, chunk_size);
btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
@@ -4937,7 +4970,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
if (!em)
return 1;
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (map->stripes[i].dev->missing) {
miss_ndevs++;
@@ -5017,7 +5050,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return 1;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
@@ -5030,10 +5063,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
ret = 1;
free_extent_map(em);
- btrfs_dev_replace_lock(&fs_info->dev_replace);
+ btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
ret++;
- btrfs_dev_replace_unlock(&fs_info->dev_replace);
+ btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
return ret;
}
@@ -5053,7 +5086,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
free_extent_map(em);
@@ -5074,7 +5107,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
@@ -5233,7 +5266,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
return -EINVAL;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
offset = logical - em->start;
stripe_len = map->stripe_len;
@@ -5293,10 +5326,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (!bbio_ret)
goto out;
- btrfs_dev_replace_lock(dev_replace);
+ btrfs_dev_replace_lock(dev_replace, 0);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (!dev_replace_is_ongoing)
- btrfs_dev_replace_unlock(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 0);
+ else
+ btrfs_dev_replace_set_lock_blocking(dev_replace);
if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
@@ -5347,35 +5382,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
* target drive.
*/
for (i = 0; i < tmp_num_stripes; i++) {
- if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
- /*
- * In case of DUP, in order to keep it
- * simple, only add the mirror with the
- * lowest physical address
- */
- if (found &&
- physical_of_found <=
- tmp_bbio->stripes[i].physical)
- continue;
- index_srcdev = i;
- found = 1;
- physical_of_found =
- tmp_bbio->stripes[i].physical;
- }
+ if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
+ continue;
+
+ /*
+ * In case of DUP, in order to keep it simple, only add
+ * the mirror with the lowest physical address
+ */
+ if (found &&
+ physical_of_found <= tmp_bbio->stripes[i].physical)
+ continue;
+
+ index_srcdev = i;
+ found = 1;
+ physical_of_found = tmp_bbio->stripes[i].physical;
}
- if (found) {
- mirror_num = index_srcdev + 1;
- patch_the_first_stripe_for_dev_replace = 1;
- physical_to_patch_in_first_stripe = physical_of_found;
- } else {
+ btrfs_put_bbio(tmp_bbio);
+
+ if (!found) {
WARN_ON(1);
ret = -EIO;
- btrfs_put_bbio(tmp_bbio);
goto out;
}
- btrfs_put_bbio(tmp_bbio);
+ mirror_num = index_srcdev + 1;
+ patch_the_first_stripe_for_dev_replace = 1;
+ physical_to_patch_in_first_stripe = physical_of_found;
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
@@ -5721,8 +5754,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
bbio->mirror_num = map->num_stripes + 1;
}
out:
- if (dev_replace_is_ongoing)
- btrfs_dev_replace_unlock(dev_replace);
+ if (dev_replace_is_ongoing) {
+ btrfs_dev_replace_clear_lock_blocking(dev_replace);
+ btrfs_dev_replace_unlock(dev_replace, 0);
+ }
free_extent_map(em);
return ret;
}
@@ -5775,7 +5810,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
free_extent_map(em);
return -EIO;
}
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
length = em->len;
rmap_len = map->stripe_len;
@@ -6038,7 +6073,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
- if (bbio->raid_map) {
+ if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+ ((rw & WRITE) || (mirror_num > 1))) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
@@ -6179,6 +6215,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
struct extent_map *em;
u64 logical;
u64 length;
+ u64 stripe_len;
u64 devid;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
@@ -6187,6 +6224,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
+ stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ /* Validation check */
+ if (!num_stripes) {
+ btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+ num_stripes);
+ return -EIO;
+ }
+ if (!IS_ALIGNED(logical, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk logical %llu", logical);
+ return -EIO;
+ }
+ if (!length || !IS_ALIGNED(length, root->sectorsize)) {
+ btrfs_err(root->fs_info,
+ "invalid chunk length %llu", length);
+ return -EIO;
+ }
+ if (!is_power_of_2(stripe_len)) {
+ btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+ stripe_len);
+ return -EIO;
+ }
+ if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk)) {
+ btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+ ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+ BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+ btrfs_chunk_type(leaf, chunk));
+ return -EIO;
+ }
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6203,7 +6271,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
em = alloc_extent_map();
if (!em)
return -ENOMEM;
- num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
free_extent_map(em);
@@ -6211,7 +6278,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
}
set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->bdev = (struct block_device *)map;
+ em->map_lookup = map;
em->start = logical;
em->len = length;
em->orig_start = 0;
@@ -6446,11 +6513,11 @@ int btrfs_read_sys_array(struct btrfs_root *root)
sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
- btrfs_set_buffer_uptodate(sb);
+ set_extent_buffer_uptodate(sb);
btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
/*
* The sb extent buffer is artifical and just used to read the system array.
- * btrfs_set_buffer_uptodate() call does not properly mark all it's
+ * set_extent_buffer_uptodate() call does not properly mark all its
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
@@ -6460,7 +6527,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
* but sb spans only this function. Add an explicit SetPageUptodate call
* to silence the warning eg. on PowerPC 64.
*/
- if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+ if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
@@ -6493,6 +6560,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
goto out_short_read;
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ if (!num_stripes) {
+ printk(KERN_ERR
+ "BTRFS: invalid number of stripes %u in sys_array at offset %u\n",
+ num_stripes, cur_offset);
+ ret = -EIO;
+ break;
+ }
+
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
@@ -6501,6 +6576,9 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (ret)
break;
} else {
+ printk(KERN_ERR
+ "BTRFS: unexpected item type %u in sys_array at offset %u\n",
+ (u32)key.type, cur_offset);
ret = -EIO;
break;
}
@@ -6632,8 +6710,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
int item_size;
struct btrfs_dev_stats_item *ptr;
- key.objectid = 0;
- key.type = BTRFS_DEV_STATS_KEY;
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
@@ -6680,8 +6758,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
int ret;
int i;
- key.objectid = 0;
- key.type = BTRFS_DEV_STATS_KEY;
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
path = btrfs_alloc_path();
@@ -6902,7 +6980,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
/* In order to kick the device replace finish process */
lock_chunks(root);
list_for_each_entry(em, &transaction->pending_chunks, list) {
- map = (struct map_lookup *)em->bdev;
+ map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
@@ -6930,7 +7008,7 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
}
}
-void btrfs_close_one_device(struct btrfs_device *device)
+static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
struct btrfs_device *new_device;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ec5712372732..1939ebde63df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -26,7 +26,7 @@
extern struct mutex uuid_mutex;
-#define BTRFS_STRIPE_LEN (64 * 1024)
+#define BTRFS_STRIPE_LEN SZ_64K
struct buffer_head;
struct btrfs_pending_bios {
@@ -382,7 +382,7 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
-#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
#define BTRFS_BALANCE_ARGS_MASK \
(BTRFS_BALANCE_ARGS_PROFILES | \
@@ -566,6 +566,5 @@ static inline void unlock_chunks(struct btrfs_root *root)
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_close_one_device(struct btrfs_device *device);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6f518c90e1c1..145d2b89e62d 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
* locks the inode's i_mutex before calling setxattr or removexattr.
*/
if (flags & XATTR_REPLACE) {
- ASSERT(mutex_is_locked(&inode->i_mutex));
+ ASSERT(inode_is_locked(inode));
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
name, name_len, 0);
if (!di)
@@ -249,7 +249,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
goto out;
inode_inc_iversion(inode);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
@@ -260,16 +260,12 @@ out:
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- struct btrfs_key key, found_key;
+ struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
- struct extent_buffer *leaf;
- struct btrfs_dir_item *di;
- int ret = 0, slot;
+ int ret = 0;
size_t total_size = 0, size_left = size;
- unsigned long name_ptr;
- size_t name_len;
/*
* ok we want all objects associated with this id.
@@ -283,7 +279,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ path->reada = READA_FORWARD;
/* search for our xattrs */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -291,6 +287,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto err;
while (1) {
+ struct extent_buffer *leaf;
+ int slot;
+ struct btrfs_dir_item *di;
+ struct btrfs_key found_key;
+ u32 item_size;
+ u32 cur;
+
leaf = path->nodes[0];
slot = path->slots[0];
@@ -313,32 +316,48 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
- if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
+ if (found_key.type < BTRFS_XATTR_ITEM_KEY)
+ goto next_item;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
- if (verify_dir_item(root, leaf, di))
- goto next;
-
- name_len = btrfs_dir_name_len(leaf, di);
- total_size += name_len + 1;
+ item_size = btrfs_item_size_nr(leaf, slot);
+ cur = 0;
+ while (cur < item_size) {
+ u16 name_len = btrfs_dir_name_len(leaf, di);
+ u16 data_len = btrfs_dir_data_len(leaf, di);
+ u32 this_len = sizeof(*di) + name_len + data_len;
+ unsigned long name_ptr = (unsigned long)(di + 1);
+
+ if (verify_dir_item(root, leaf, di)) {
+ ret = -EIO;
+ goto err;
+ }
- /* we are just looking for how big our buffer needs to be */
- if (!size)
- goto next;
+ total_size += name_len + 1;
+ /*
+ * We are just looking for how big our buffer needs to
+ * be.
+ */
+ if (!size)
+ goto next;
- if (!buffer || (name_len + 1) > size_left) {
- ret = -ERANGE;
- goto err;
- }
+ if (!buffer || (name_len + 1) > size_left) {
+ ret = -ERANGE;
+ goto err;
+ }
- name_ptr = (unsigned long)(di + 1);
- read_extent_buffer(leaf, buffer, name_ptr, name_len);
- buffer[name_len] = '\0';
+ read_extent_buffer(leaf, buffer, name_ptr, name_len);
+ buffer[name_len] = '\0';
- size_left -= name_len + 1;
- buffer += name_len + 1;
+ size_left -= name_len + 1;
+ buffer += name_len + 1;
next:
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *)di + this_len);
+ }
+next_item:
path->slots[0]++;
}
ret = total_size;
@@ -349,137 +368,89 @@ err:
return ret;
}
-/*
- * List of handlers for synthetic system.* attributes. All real ondisk
- * attributes are handled directly.
- */
-const struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- NULL,
-};
-
-/*
- * Check if the attribute is in a supported namespace.
- *
- * This is applied after the check for the synthetic attributes in the system
- * namespace.
- */
-static int btrfs_is_valid_xattr(const char *name)
+static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- int len = strlen(name);
- int prefixlen = 0;
-
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- XATTR_SECURITY_PREFIX_LEN))
- prefixlen = XATTR_SECURITY_PREFIX_LEN;
- else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- prefixlen = XATTR_SYSTEM_PREFIX_LEN;
- else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
- prefixlen = XATTR_TRUSTED_PREFIX_LEN;
- else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- prefixlen = XATTR_USER_PREFIX_LEN;
- else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- prefixlen = XATTR_BTRFS_PREFIX_LEN;
- else
- return -EOPNOTSUPP;
-
- /*
- * The name cannot consist of just prefix
- */
- if (len <= prefixlen)
- return -EINVAL;
+ struct inode *inode = d_inode(dentry);
- return 0;
+ name = xattr_full_name(handler, name);
+ return __btrfs_getxattr(inode, name, buffer, size);
}
-ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size,
+ int flags)
{
- int ret;
+ struct inode *inode = d_inode(dentry);
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, buffer, size);
+ name = xattr_full_name(handler, name);
+ return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+}
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
- return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+ struct dentry *dentry,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ name = xattr_full_name(handler, name);
+ return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
}
+static const struct xattr_handler btrfs_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_user_xattr_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_btrfs_xattr_handler = {
+ .prefix = XATTR_BTRFS_PREFIX,
+ .get = btrfs_xattr_handler_get,
+ .set = btrfs_xattr_handler_set_prop,
+};
+
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+ &btrfs_security_xattr_handler,
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &btrfs_trusted_xattr_handler,
+ &btrfs_user_xattr_handler,
+ &btrfs_btrfs_xattr_handler,
+ NULL,
+};
+
int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- value, size, flags);
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
- flags);
+ return generic_setxattr(dentry, name, value, size, flags);
}
int btrfs_removexattr(struct dentry *dentry, const char *name)
{
struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
- int ret;
- /*
- * The permission on security.* and system.* is not checked
- * in permission().
- */
if (btrfs_root_readonly(root))
return -EROFS;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
-
- ret = btrfs_is_valid_xattr(name);
- if (ret)
- return ret;
-
- if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
- return btrfs_set_prop(d_inode(dentry), name,
- NULL, 0, XATTR_REPLACE);
-
- return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
- XATTR_REPLACE);
+ return generic_removexattr(dentry, name);
}
static int btrfs_initxattrs(struct inode *inode,
@@ -492,7 +463,7 @@ static int btrfs_initxattrs(struct inode *inode,
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_NOFS);
+ strlen(xattr->name) + 1, GFP_KERNEL);
if (!name) {
err = -ENOMEM;
break;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5049608d1388..96807b3d22f5 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags);
-extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size);
extern int btrfs_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
extern int btrfs_removexattr(struct dentry *dentry, const char *name);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 82990b8f872b..88d274e8ecf2 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -59,7 +59,7 @@ static struct list_head *zlib_alloc_workspace(void)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
workspace->strm.workspace = vmalloc(workspacesize);
- workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS);
if (!workspace->strm.workspace || !workspace->buf)
goto fail;
@@ -103,7 +103,7 @@ static int zlib_compress_pages(struct list_head *ws,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+ in_page = find_get_page(mapping, start >> PAGE_SHIFT);
data_in = kmap(in_page);
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
@@ -117,8 +117,8 @@ static int zlib_compress_pages(struct list_head *ws,
workspace->strm.next_in = data_in;
workspace->strm.next_out = cpage_out;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
- workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
+ workspace->strm.avail_out = PAGE_SIZE;
+ workspace->strm.avail_in = min(len, PAGE_SIZE);
while (workspace->strm.total_in < len) {
ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
@@ -156,7 +156,7 @@ static int zlib_compress_pages(struct list_head *ws,
cpage_out = kmap(out_page);
pages[nr_pages] = out_page;
nr_pages++;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
workspace->strm.next_out = cpage_out;
}
/* we're all done */
@@ -170,14 +170,14 @@ static int zlib_compress_pages(struct list_head *ws,
bytes_left = len - workspace->strm.total_in;
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
- start += PAGE_CACHE_SIZE;
+ start += PAGE_SIZE;
in_page = find_get_page(mapping,
- start >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT);
data_in = kmap(in_page);
workspace->strm.avail_in = min(bytes_left,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
workspace->strm.next_in = data_in;
}
}
@@ -205,7 +205,7 @@ out:
if (in_page) {
kunmap(in_page);
- page_cache_release(in_page);
+ put_page(in_page);
}
return ret;
}
@@ -223,18 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
size_t total_out = 0;
unsigned long page_in_index = 0;
unsigned long page_out_index = 0;
- unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+ unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
unsigned long buf_start;
unsigned long pg_offset;
data_in = kmap(pages_in[page_in_index]);
workspace->strm.next_in = data_in;
- workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+ workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
pg_offset = 0;
/* If it's deflate, and it's got no preset dictionary, then
@@ -274,7 +274,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
if (workspace->strm.avail_in == 0) {
unsigned long tmp;
@@ -288,7 +288,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
workspace->strm.next_in = data_in;
tmp = srclen - workspace->strm.total_in;
workspace->strm.avail_in = min(tmp,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
}
}
if (ret != Z_STREAM_END)
@@ -325,7 +325,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
workspace->strm.total_in = 0;
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
workspace->strm.total_out = 0;
/* If it's deflate, and it's got no preset dictionary, then
we can tell zlib to skip the adler32 check. */
@@ -368,8 +368,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
else
buf_offset = 0;
- bytes = min(PAGE_CACHE_SIZE - pg_offset,
- PAGE_CACHE_SIZE - buf_offset);
+ bytes = min(PAGE_SIZE - pg_offset,
+ PAGE_SIZE - buf_offset);
bytes = min(bytes, bytes_left);
kaddr = kmap_atomic(dest_page);
@@ -380,7 +380,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
bytes_left -= bytes;
next:
workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = PAGE_CACHE_SIZE;
+ workspace->strm.avail_out = PAGE_SIZE;
}
if (ret != Z_STREAM_END && bytes_left != 0)
diff --git a/fs/buffer.c b/fs/buffer.c
index 82283abb2795..af0d9a82a8ed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -129,18 +129,15 @@ __clear_page_buffers(struct page *page)
{
ClearPagePrivate(page);
set_page_private(page, 0);
- page_cache_release(page);
+ put_page(page);
}
static void buffer_io_error(struct buffer_head *bh, char *msg)
{
- char b[BDEVNAME_SIZE];
-
if (!test_bit(BH_Quiet, &bh->b_state))
printk_ratelimited(KERN_ERR
- "Buffer I/O error on dev %s, logical block %llu%s\n",
- bdevname(bh->b_bdev, b),
- (unsigned long long)bh->b_blocknr, msg);
+ "Buffer I/O error on dev %pg, logical block %llu%s\n",
+ bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}
/*
@@ -210,7 +207,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
struct page *page;
int all_mapped = 1;
- index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+ index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
if (!page)
goto out;
@@ -237,20 +234,18 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
* elsewhere, don't buffer_error if we had some unmapped buffers
*/
if (all_mapped) {
- char b[BDEVNAME_SIZE];
-
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%zu\n",
bh->b_state, bh->b_size);
- printk("device %s blocksize: %d\n", bdevname(bdev, b),
+ printk("device %pg blocksize: %d\n", bdev,
1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
- page_cache_release(page);
+ put_page(page);
out:
return ret;
}
@@ -531,10 +526,8 @@ repeat:
static void do_thaw_one(struct super_block *sb, void *unused)
{
- char b[BDEVNAME_SIZE];
while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
- printk(KERN_WARNING "Emergency Thaw on %s\n",
- bdevname(sb->s_bdev, b));
+ printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
static void do_thaw_all(struct work_struct *work)
@@ -628,17 +621,17 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* If warn is true, then emit a warning if the page is not uptodate and has
* not been truncated.
*
- * The caller must hold mem_cgroup_begin_page_stat() lock.
+ * The caller must hold lock_page_memcg().
*/
static void __set_page_dirty(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, int warn)
+ int warn)
{
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
@@ -673,7 +666,6 @@ static void __set_page_dirty(struct page *page, struct address_space *mapping,
int __set_page_dirty_buffers(struct page *page)
{
int newly_dirty;
- struct mem_cgroup *memcg;
struct address_space *mapping = page_mapping(page);
if (unlikely(!mapping))
@@ -690,17 +682,17 @@ int __set_page_dirty_buffers(struct page *page)
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, memcg, 1);
+ __set_page_dirty(page, mapping, 1);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -999,7 +991,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+ gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
/*
* XXX: __getblk_slow() can not really deal with failure and
@@ -1048,7 +1040,7 @@ done:
ret = (block < end_block) ? 1 : -ENXIO;
failed:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -1074,12 +1066,10 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
* pagecache index. (this comparison is done using sector_t types).
*/
if (unlikely(index != block >> sizebits)) {
- char b[BDEVNAME_SIZE];
-
printk(KERN_ERR "%s: requested out-of-range block %llu for "
- "device %s\n",
+ "device %pg\n",
__func__, (unsigned long long)block,
- bdevname(bdev, b));
+ bdev);
return -EIO;
}
@@ -1176,15 +1166,14 @@ void mark_buffer_dirty(struct buffer_head *bh)
if (!test_set_buffer_dirty(bh)) {
struct page *page = bh->b_page;
struct address_space *mapping = NULL;
- struct mem_cgroup *memcg;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
mapping = page_mapping(page);
if (mapping)
- __set_page_dirty(page, mapping, memcg, 0);
+ __set_page_dirty(page, mapping, 0);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
@@ -1544,7 +1533,7 @@ void block_invalidatepage(struct page *page, unsigned int offset,
/*
* Check for overflow
*/
- BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+ BUG_ON(stop > PAGE_SIZE || stop < length);
head = page_buffers(page);
bh = head;
@@ -1727,7 +1716,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
blocksize = bh->b_size;
bbits = block_size_bits(blocksize);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - bbits);
last_block = (i_size_read(inode) - 1) >> bbits;
/*
@@ -1905,7 +1894,7 @@ EXPORT_SYMBOL(page_zero_new_buffers);
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
@@ -1915,15 +1904,15 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > PAGE_SIZE);
+ BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
head = create_page_buffers(page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
@@ -2031,7 +2020,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
unsigned flags, struct page **pagep, get_block_t *get_block)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status;
@@ -2042,7 +2031,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
status = __block_write_begin(page, pos, len, get_block);
if (unlikely(status)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
@@ -2058,7 +2047,7 @@ int block_write_end(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
unsigned start;
- start = pos & (PAGE_CACHE_SIZE - 1);
+ start = pos & (PAGE_SIZE - 1);
if (unlikely(copied < len)) {
/*
@@ -2110,7 +2099,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -2147,9 +2136,9 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
head = page_buffers(page);
blocksize = head->b_size;
- to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
+ to = min_t(unsigned, PAGE_SIZE - from, count);
to = from + to;
- if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
+ if (from < blocksize && to > PAGE_SIZE - blocksize)
return 0;
bh = head;
@@ -2192,7 +2181,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
- iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
@@ -2306,16 +2295,16 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
unsigned zerofrom, offset, len;
int err = 0;
- index = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ~PAGE_CACHE_MASK;
+ index = pos >> PAGE_SHIFT;
+ offset = pos & ~PAGE_MASK;
- while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
- zerofrom = curpos & ~PAGE_CACHE_MASK;
+ while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
+ zerofrom = curpos & ~PAGE_MASK;
if (zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
}
- len = PAGE_CACHE_SIZE - zerofrom;
+ len = PAGE_SIZE - zerofrom;
err = pagecache_write_begin(file, mapping, curpos, len,
AOP_FLAG_UNINTERRUPTIBLE,
@@ -2340,7 +2329,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
/* page covers the boundary, find the boundary offset */
if (index == curidx) {
- zerofrom = curpos & ~PAGE_CACHE_MASK;
+ zerofrom = curpos & ~PAGE_MASK;
/* if we will expand the thing last block will be filled */
if (offset <= zerofrom) {
goto out;
@@ -2386,7 +2375,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
if (err)
return err;
- zerofrom = *bytes & ~PAGE_CACHE_MASK;
+ zerofrom = *bytes & ~PAGE_MASK;
if (pos+len > *bytes && zerofrom & (blocksize-1)) {
*bytes |= (blocksize-1);
(*bytes)++;
@@ -2420,9 +2409,9 @@ EXPORT_SYMBOL(block_commit_write);
* unlock the page.
*
* Direct callers of this function should protect against filesystem freezing
- * using sb_start_write() - sb_end_write() functions.
+ * using sb_start_pagefault() - sb_end_pagefault() functions.
*/
-int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
struct page *page = vmf->page;
@@ -2441,10 +2430,10 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
}
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
- end = size & ~PAGE_CACHE_MASK;
+ if (((page->index + 1) << PAGE_SHIFT) > size)
+ end = size & ~PAGE_MASK;
else
- end = PAGE_CACHE_SIZE;
+ end = PAGE_SIZE;
ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
@@ -2459,26 +2448,6 @@ out_unlock:
unlock_page(page);
return ret;
}
-EXPORT_SYMBOL(__block_page_mkwrite);
-
-int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
-{
- int ret;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
- sb_start_pagefault(sb);
-
- /*
- * Update file times before taking page lock. We may end up failing the
- * fault so this update may be superfluous but who really cares...
- */
- file_update_time(vma->vm_file);
-
- ret = __block_page_mkwrite(vma, vmf, get_block);
- sb_end_pagefault(sb);
- return block_page_mkwrite_return(ret);
-}
EXPORT_SYMBOL(block_page_mkwrite);
/*
@@ -2539,8 +2508,8 @@ int nobh_write_begin(struct address_space *mapping,
int ret = 0;
int is_mapped_to_disk = 1;
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
+ index = pos >> PAGE_SHIFT;
+ from = pos & (PAGE_SIZE - 1);
to = from + len;
page = grab_cache_page_write_begin(mapping, index, flags);
@@ -2574,7 +2543,7 @@ int nobh_write_begin(struct address_space *mapping,
goto out_release;
}
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
/*
* We loop across all blocks in the page, whether or not they are
@@ -2582,7 +2551,7 @@ int nobh_write_begin(struct address_space *mapping,
* page is fully mapped-to-disk.
*/
for (block_start = 0, block_in_page = 0, bh = head;
- block_start < PAGE_CACHE_SIZE;
+ block_start < PAGE_SIZE;
block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
int create;
@@ -2654,7 +2623,7 @@ failed:
out_release:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
*pagep = NULL;
return ret;
@@ -2684,7 +2653,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
while (head) {
bh = head;
@@ -2706,7 +2675,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
int ret;
@@ -2715,7 +2684,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
goto out;
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
/*
* The page may have dirty, unmapped buffers. For example,
@@ -2738,7 +2707,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
out:
ret = mpage_writepage(page, get_block, wbc);
if (ret == -EAGAIN)
@@ -2751,8 +2720,8 @@ EXPORT_SYMBOL(nobh_writepage);
int nobh_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize;
sector_t iblock;
unsigned length, pos;
@@ -2769,7 +2738,7 @@ int nobh_truncate_page(struct address_space *mapping,
return 0;
length = blocksize - length;
- iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
@@ -2779,7 +2748,7 @@ int nobh_truncate_page(struct address_space *mapping,
if (page_has_buffers(page)) {
has_buffers:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return block_truncate_page(mapping, from, get_block);
}
@@ -2803,7 +2772,7 @@ has_buffers:
if (!PageUptodate(page)) {
err = mapping->a_ops->readpage(NULL, page);
if (err) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
lock_page(page);
@@ -2820,7 +2789,7 @@ has_buffers:
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
return err;
}
@@ -2829,8 +2798,8 @@ EXPORT_SYMBOL(nobh_truncate_page);
int block_truncate_page(struct address_space *mapping,
loff_t from, get_block_t *get_block)
{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize;
sector_t iblock;
unsigned length, pos;
@@ -2847,7 +2816,7 @@ int block_truncate_page(struct address_space *mapping,
return 0;
length = blocksize - length;
- iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
page = grab_cache_page(mapping, index);
err = -ENOMEM;
@@ -2896,7 +2865,7 @@ int block_truncate_page(struct address_space *mapping,
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
return err;
}
@@ -2910,7 +2879,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
/* Is the page fully inside i_size? */
@@ -2919,14 +2888,14 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
end_buffer_async_write);
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (page->index >= end_index+1 || !offset) {
/*
* The page may have dirty, unmapped buffers. For example,
* they may have been added in ext3_writepage(). Make them
* freeable here, so the page does not leak.
*/
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return 0; /* don't care */
}
@@ -2938,7 +2907,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
return __block_write_full_page(inode, page, get_block, wbc,
end_buffer_async_write);
}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index f601def05bdf..1ee54ffd3a24 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -162,6 +162,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
size_t buflen, loff_t *pos)
{
struct cachefiles_cache *cache = file->private_data;
+ unsigned long long b_released;
+ unsigned f_released;
char buffer[256];
int n;
@@ -174,6 +176,8 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
cachefiles_has_space(cache, 0, 0);
/* summarise */
+ f_released = atomic_xchg(&cache->f_released, 0);
+ b_released = atomic_long_xchg(&cache->b_released, 0);
clear_bit(CACHEFILES_STATE_CHANGED, &cache->flags);
n = snprintf(buffer, sizeof(buffer),
@@ -183,15 +187,18 @@ static ssize_t cachefiles_daemon_read(struct file *file, char __user *_buffer,
" fstop=%llx"
" brun=%llx"
" bcull=%llx"
- " bstop=%llx",
+ " bstop=%llx"
+ " freleased=%x"
+ " breleased=%llx",
test_bit(CACHEFILES_CULLING, &cache->flags) ? '1' : '0',
(unsigned long long) cache->frun,
(unsigned long long) cache->fcull,
(unsigned long long) cache->fstop,
(unsigned long long) cache->brun,
(unsigned long long) cache->bcull,
- (unsigned long long) cache->bstop
- );
+ (unsigned long long) cache->bstop,
+ f_released,
+ b_released);
if (n > buflen)
return -EMSGSIZE;
@@ -226,15 +233,9 @@ static ssize_t cachefiles_daemon_write(struct file *file,
return -EOPNOTSUPP;
/* drag the command string into the kernel so we can parse it */
- data = kmalloc(datalen + 1, GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- ret = -EFAULT;
- if (copy_from_user(data, _data, datalen) != 0)
- goto error;
-
- data[datalen] = '\0';
+ data = memdup_user_nul(_data, datalen);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
ret = -EINVAL;
if (memchr(data, '\0', datalen))
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index afa023dded5b..861d611b8c05 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -291,15 +291,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
}
/* note that the object is now inactive */
- if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
- write_lock(&cache->active_lock);
- if (!test_and_clear_bit(CACHEFILES_OBJECT_ACTIVE,
- &object->flags))
- BUG();
- rb_erase(&object->active_node, &cache->active_nodes);
- wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
- write_unlock(&cache->active_lock);
- }
+ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
+ cachefiles_mark_object_inactive(cache, object);
dput(object->dentry);
object->dentry = NULL;
@@ -446,7 +439,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
return 0;
cachefiles_begin_secure(cache, &saved_cred);
- mutex_lock(&d_inode(object->backer)->i_mutex);
+ inode_lock(d_inode(object->backer));
/* if there's an extension to a partial page at the end of the backing
* file, we need to discard the partial page so that we pick up new
@@ -465,7 +458,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
ret = notify_change(object->backer, &newattrs, NULL);
truncate_failed:
- mutex_unlock(&d_inode(object->backer)->i_mutex);
+ inode_unlock(d_inode(object->backer));
cachefiles_end_secure(cache, saved_cred);
if (ret == -EIO) {
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index aecd0859eacb..2fcde1a34b7c 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -30,7 +30,7 @@ extern unsigned cachefiles_debug;
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
-#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
/*
* node records
@@ -66,6 +66,8 @@ struct cachefiles_cache {
struct rb_root active_nodes; /* active nodes (can't be culled) */
rwlock_t active_lock; /* lock for active_nodes */
atomic_t gravecounter; /* graveyard uniquifier */
+ atomic_t f_released; /* number of objects released lately */
+ atomic_long_t b_released; /* number of blocks released lately */
unsigned frun_percent; /* when to stop culling (% files) */
unsigned fcull_percent; /* when to start culling (% files) */
unsigned fstop_percent; /* when to stop allocating (% files) */
@@ -157,6 +159,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
/*
* namei.c
*/
+extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
+ struct cachefiles_object *object);
extern int cachefiles_delete_object(struct cachefiles_cache *cache,
struct cachefiles_object *object);
extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index fc1056f5c96a..4ae75006e73b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -258,6 +258,28 @@ requeue:
}
/*
+ * Mark an object as being inactive.
+ */
+void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
+ struct cachefiles_object *object)
+{
+ write_lock(&cache->active_lock);
+ rb_erase(&object->active_node, &cache->active_nodes);
+ clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
+ write_unlock(&cache->active_lock);
+
+ wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
+
+ /* This object can now be culled, so we need to let the daemon know
+ * that there is something it can remove if it needs to.
+ */
+ atomic_long_add(d_backing_inode(object->dentry)->i_blocks,
+ &cache->b_released);
+ if (atomic_inc_return(&cache->f_released))
+ cachefiles_state_changed(cache);
+}
+
+/*
* delete an object representation from the cache
* - file backed objects are unlinked
* - directory backed objects are stuffed into the graveyard for userspace to
@@ -295,7 +317,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
cachefiles_mark_object_buried(cache, rep, why);
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
if (ret == -EIO)
cachefiles_io_error(cache, "Unlink failed");
@@ -306,7 +328,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
/* directories have to be moved to the graveyard */
_debug("move stale object to graveyard");
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
try_again:
/* first step is to make up a grave dentry in the graveyard */
@@ -423,13 +445,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
dir = dget_parent(object->dentry);
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
/* object allocation for the same key preemptively deleted this
* object's file so that it could create its own file */
_debug("object preemptively buried");
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = 0;
} else {
/* we need to check that our parent is _still_ our parent - it
@@ -442,7 +464,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
/* it got moved, presumably by cachefilesd culling it,
* so it's no longer in the key path and we can ignore
* it */
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = 0;
}
}
@@ -501,7 +523,7 @@ lookup_again:
/* search the current directory for the element name */
_debug("lookup '%s'", name);
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
start = jiffies;
next = lookup_one_len(name, dir, nlen);
@@ -585,7 +607,7 @@ lookup_again:
/* process the next component */
if (key) {
_debug("advance");
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(dir);
dir = next;
next = NULL;
@@ -623,7 +645,7 @@ lookup_again:
/* note that we're now using this object */
ret = cachefiles_mark_object_active(cache, object);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(dir);
dir = NULL;
@@ -655,6 +677,8 @@ lookup_again:
aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
if (!aops->bmap)
goto check_error;
+ if (object->dentry->d_sb->s_blocksize > PAGE_SIZE)
+ goto check_error;
object->backer = object->dentry;
} else {
@@ -682,11 +706,7 @@ mark_active_timed_out:
check_error:
_debug("check error %d", ret);
- write_lock(&cache->active_lock);
- rb_erase(&object->active_node, &cache->active_nodes);
- clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
- wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
- write_unlock(&cache->active_lock);
+ cachefiles_mark_object_inactive(cache, object);
release_dentry:
dput(object->dentry);
object->dentry = NULL;
@@ -703,7 +723,7 @@ lookup_error:
cachefiles_io_error(cache, "Lookup failed");
next = NULL;
error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(next);
error_out2:
dput(dir);
@@ -727,7 +747,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
_enter(",,%s", dirname);
/* search the current directory for the element name */
- mutex_lock(&d_inode(dir)->i_mutex);
+ inode_lock(d_inode(dir));
start = jiffies;
subdir = lookup_one_len(dirname, dir, strlen(dirname));
@@ -766,7 +786,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
d_backing_inode(subdir)->i_ino);
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
/* we need to make sure the subdir is a directory */
ASSERT(d_backing_inode(subdir));
@@ -798,19 +818,19 @@ check_error:
return ERR_PTR(ret);
mkdir_error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
lookup_error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = PTR_ERR(subdir);
pr_err("Lookup %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
nomem_d_alloc:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
_leave(" = -ENOMEM");
return ERR_PTR(-ENOMEM);
}
@@ -835,7 +855,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
// dir, filename);
/* look up the victim */
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
start = jiffies;
victim = lookup_one_len(filename, dir, strlen(filename));
@@ -850,7 +870,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
* at the netfs's request whilst the cull was in progress
*/
if (d_is_negative(victim)) {
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(victim);
_leave(" = -ENOENT [absent]");
return ERR_PTR(-ENOENT);
@@ -879,13 +899,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
object_in_use:
read_unlock(&cache->active_lock);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(victim);
//_leave(" = -EBUSY [in use]");
return ERR_PTR(-EBUSY);
lookup_error:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
ret = PTR_ERR(victim);
if (ret == -ENOENT) {
/* file or dir now absent - probably retired by netfs */
@@ -945,7 +965,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
return 0;
error_unlock:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
error:
dput(victim);
if (ret == -ENOENT) {
@@ -980,7 +1000,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
if (IS_ERR(victim))
return PTR_ERR(victim);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(victim);
//_leave(" = 0");
return 0;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3cbb0e834694..afbdc418966d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -194,10 +194,10 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
error = -EIO;
}
- page_cache_release(monitor->back_page);
+ put_page(monitor->back_page);
fscache_end_io(op, monitor->netfs_page, error);
- page_cache_release(monitor->netfs_page);
+ put_page(monitor->netfs_page);
fscache_retrieval_complete(op, 1);
fscache_put_retrieval(op);
kfree(monitor);
@@ -288,8 +288,8 @@ monitor_backing_page:
_debug("- monitor add");
/* install the monitor */
- page_cache_get(monitor->netfs_page);
- page_cache_get(backpage);
+ get_page(monitor->netfs_page);
+ get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
add_page_wait_queue(backpage, &monitor->monitor);
@@ -310,7 +310,7 @@ backing_page_already_present:
_debug("- present");
if (newpage) {
- page_cache_release(newpage);
+ put_page(newpage);
newpage = NULL;
}
@@ -342,7 +342,7 @@ success:
out:
if (backpage)
- page_cache_release(backpage);
+ put_page(backpage);
if (monitor) {
fscache_put_retrieval(monitor->op);
kfree(monitor);
@@ -363,7 +363,7 @@ io_error:
goto out;
nomem_page:
- page_cache_release(newpage);
+ put_page(newpage);
nomem_monitor:
fscache_put_retrieval(monitor->op);
kfree(monitor);
@@ -414,9 +414,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
@@ -533,7 +530,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
- page_cache_release(netpage);
+ put_page(netpage);
fscache_retrieval_complete(op, 1);
continue;
}
@@ -541,10 +538,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
}
/* install a monitor */
- page_cache_get(netpage);
+ get_page(netpage);
monitor->netfs_page = netpage;
- page_cache_get(backpage);
+ get_page(backpage);
monitor->back_page = backpage;
monitor->monitor.private = backpage;
add_page_wait_queue(backpage, &monitor->monitor);
@@ -558,10 +555,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
unlock_page(backpage);
}
- page_cache_release(backpage);
+ put_page(backpage);
backpage = NULL;
- page_cache_release(netpage);
+ put_page(netpage);
netpage = NULL;
continue;
@@ -606,7 +603,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
- page_cache_release(netpage);
+ put_page(netpage);
fscache_retrieval_complete(op, 1);
continue;
}
@@ -615,14 +612,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
copy_highpage(netpage, backpage);
- page_cache_release(backpage);
+ put_page(backpage);
backpage = NULL;
fscache_mark_page_cached(op, netpage);
/* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
- page_cache_release(netpage);
+ put_page(netpage);
netpage = NULL;
fscache_retrieval_complete(op, 1);
continue;
@@ -635,11 +632,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
out:
/* tidy up */
if (newpage)
- page_cache_release(newpage);
+ put_page(newpage);
if (netpage)
- page_cache_release(netpage);
+ put_page(netpage);
if (backpage)
- page_cache_release(backpage);
+ put_page(backpage);
if (monitor) {
fscache_put_retrieval(op);
kfree(monitor);
@@ -647,7 +644,7 @@ out:
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
- page_cache_release(netpage);
+ put_page(netpage);
fscache_retrieval_complete(op, 1);
}
@@ -711,9 +708,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto all_enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
pagevec_init(&pagevec, 0);
@@ -885,7 +879,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
loff_t pos, eof;
size_t len;
void *data;
- int ret;
+ int ret = -ENOBUFS;
ASSERT(op != NULL);
ASSERT(page != NULL);
@@ -905,6 +899,15 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
cache = container_of(object->fscache.cache,
struct cachefiles_cache, cache);
+ pos = (loff_t)page->index << PAGE_SHIFT;
+
+ /* We mustn't write more data than we have, so we have to beware of a
+ * partial page at EOF.
+ */
+ eof = object->fscache.store_limit_l;
+ if (pos >= eof)
+ goto error;
+
/* write the page to the backing filesystem and let it store it in its
* own time */
path.mnt = cache->mnt;
@@ -912,40 +915,38 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
- } else {
- pos = (loff_t) page->index << PAGE_SHIFT;
-
- /* we mustn't write more data than we have, so we have
- * to beware of a partial page at EOF */
- eof = object->fscache.store_limit_l;
- len = PAGE_SIZE;
- if (eof & ~PAGE_MASK) {
- ASSERTCMP(pos, <, eof);
- if (eof - pos < PAGE_SIZE) {
- _debug("cut short %llx to %llx",
- pos, eof);
- len = eof - pos;
- ASSERTCMP(pos + len, ==, eof);
- }
- }
-
- data = kmap(page);
- ret = __kernel_write(file, data, len, &pos);
- kunmap(page);
- if (ret != len)
- ret = -EIO;
- fput(file);
+ goto error_2;
}
- if (ret < 0) {
- if (ret == -EIO)
- cachefiles_io_error_obj(
- object, "Write page to backing file failed");
- ret = -ENOBUFS;
+ len = PAGE_SIZE;
+ if (eof & ~PAGE_MASK) {
+ if (eof - pos < PAGE_SIZE) {
+ _debug("cut short %llx to %llx",
+ pos, eof);
+ len = eof - pos;
+ ASSERTCMP(pos + len, ==, eof);
+ }
}
- _leave(" = %d", ret);
- return ret;
+ data = kmap(page);
+ ret = __kernel_write(file, data, len, &pos);
+ kunmap(page);
+ fput(file);
+ if (ret != len)
+ goto error_eio;
+
+ _leave(" = 0");
+ return 0;
+
+error_eio:
+ ret = -EIO;
+error_2:
+ if (ret == -EIO)
+ cachefiles_io_error_obj(object,
+ "Write page to backing file failed");
+error:
+ _leave(" = -ENOBUFS [%d]", ret);
+ return -ENOBUFS;
}
/*
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 8f84646f10e9..f19708487e2f 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -49,10 +49,10 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -92,7 +92,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
ret = posix_acl_equiv_mode(acl, &new_mode);
if (ret < 0)
@@ -106,7 +106,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
ret = acl ? -EINVAL : 0;
goto out;
}
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
ret = -EINVAL;
@@ -202,11 +202,11 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
if (acl) {
- size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
if (err)
goto out_err;
- ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
+ ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
len);
err = posix_acl_to_xattr(&init_user_ns, acl,
tmp_buf, val_size1);
@@ -216,12 +216,12 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
ceph_pagelist_append(pagelist, tmp_buf, val_size1);
}
if (default_acl) {
- size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
+ size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
if (err)
goto out_err;
err = ceph_pagelist_encode_string(pagelist,
- POSIX_ACL_XATTR_DEFAULT, len);
+ XATTR_NAME_POSIX_ACL_DEFAULT, len);
err = posix_acl_to_xattr(&init_user_ns, default_acl,
tmp_buf, val_size2);
if (err < 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9d23e788d1df..4801571f51cb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,7 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
inode = page->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != PAGE_CACHE_SIZE) {
+ if (offset != 0 || length != PAGE_SIZE) {
dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
inode, page, page->index, offset, length);
return;
@@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
static int ceph_releasepage(struct page *page, gfp_t g)
{
- struct inode *inode = page->mapping ? page->mapping->host : NULL;
- dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+ dout("%p releasepage %p idx %lu\n", page->mapping->host,
+ page, page->index);
WARN_ON(PageDirty(page));
/* Can we release the page from the cache? */
@@ -197,10 +197,10 @@ static int readpage_nounlock(struct file *filp, struct page *page)
&ceph_inode_to_client(inode)->client->osdc;
int err = 0;
u64 off = page_offset(page);
- u64 len = PAGE_CACHE_SIZE;
+ u64 len = PAGE_SIZE;
if (off >= i_size_read(inode)) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
return 0;
}
@@ -212,7 +212,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
*/
if (off == 0)
return -EINVAL;
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
return 0;
}
@@ -234,9 +234,9 @@ static int readpage_nounlock(struct file *filp, struct page *page)
ceph_fscache_readpage_cancel(inode, page);
goto out;
}
- if (err < PAGE_CACHE_SIZE)
+ if (err < PAGE_SIZE)
/* zero fill remainder of page */
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ zero_user_segment(page, err, PAGE_SIZE);
else
flush_dcache_page(page);
@@ -276,12 +276,12 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
- if (rc < 0 && rc != ENOENT)
+ if (rc < 0 && rc != -ENOENT)
goto unlock;
- if (bytes < (int)PAGE_CACHE_SIZE) {
+ if (bytes < (int)PAGE_SIZE) {
/* zero (remainder of) page */
int s = bytes < 0 ? 0 : bytes;
- zero_user_segment(page, s, PAGE_CACHE_SIZE);
+ zero_user_segment(page, s, PAGE_SIZE);
}
dout("finish_read %p uptodate %p idx %lu\n", inode, page,
page->index);
@@ -290,8 +290,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
ceph_readpage_to_fscache(inode, page);
unlock:
unlock_page(page);
- page_cache_release(page);
- bytes -= PAGE_CACHE_SIZE;
+ put_page(page);
+ bytes -= PAGE_SIZE;
}
kfree(osd_data->pages);
}
@@ -336,7 +336,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
if (max && nr_pages == max)
break;
}
- len = nr_pages << PAGE_CACHE_SHIFT;
+ len = nr_pages << PAGE_SHIFT;
dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
off, len);
vino = ceph_vino(inode);
@@ -364,7 +364,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
GFP_KERNEL)) {
ceph_fscache_uncache_page(inode, page);
- page_cache_release(page);
+ put_page(page);
dout("start_read %p add_to_page_cache failed %p\n",
inode, page);
nr_pages = i;
@@ -415,8 +415,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
if (rc == 0)
goto out;
- if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
- max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+ if (fsc->mount_options->rsize >= PAGE_SIZE)
+ max = (fsc->mount_options->rsize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
dout("readpages %p file %p nr_pages %d max %d\n", inode,
@@ -484,7 +484,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
long writeback_stat;
u64 truncate_size;
u32 truncate_seq;
- int err = 0, len = PAGE_CACHE_SIZE;
+ int err = 0, len = PAGE_SIZE;
dout("writepage %p idx %lu\n", page, page->index);
@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_data *osd_data;
- unsigned wrote;
struct page *page;
- int num_pages;
- int i;
+ int num_pages, total_pages = 0;
+ int i, j;
+ int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
- int rc = req->r_result;
- u64 bytes = req->r_ops[0].extent.length;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- long writeback_stat;
- unsigned issued = ceph_caps_issued(ci);
+ bool remove_page;
- osd_data = osd_req_op_extent_osd_data(req, 0);
- BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
- num_pages = calc_pages_for((u64)osd_data->alignment,
- (u64)osd_data->length);
- if (rc >= 0) {
- /*
- * Assume we wrote the pages we originally sent. The
- * osd might reply with fewer pages if our writeback
- * raced with a truncation and was adjusted at the osd,
- * so don't believe the reply.
- */
- wrote = num_pages;
- } else {
- wrote = 0;
+
+ dout("writepages_finish %p rc %d\n", inode, rc);
+ if (rc < 0)
mapping_set_error(mapping, rc);
- }
- dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
- inode, rc, bytes, wrote);
- /* clean all pages */
- for (i = 0; i < num_pages; i++) {
- page = osd_data->pages[i];
- BUG_ON(!page);
- WARN_ON(!PageUptodate(page));
+ /*
+ * We lost the cache cap, need to truncate the page before
+ * it is unlocked, otherwise we'd truncate it later in the
+ * page truncation thread, possibly losing some data that
+ * raced its way in
+ */
+ remove_page = !(ceph_caps_issued(ci) &
+ (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
- writeback_stat =
- atomic_long_dec_return(&fsc->writeback_count);
- if (writeback_stat <
- CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
- BLK_RW_ASYNC);
+ /* clean all pages */
+ for (i = 0; i < req->r_num_ops; i++) {
+ if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+ break;
- ceph_put_snap_context(page_snap_context(page));
- page->private = 0;
- ClearPagePrivate(page);
- dout("unlocking %d %p\n", i, page);
- end_page_writeback(page);
+ osd_data = osd_req_op_extent_osd_data(req, i);
+ BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
+ num_pages = calc_pages_for((u64)osd_data->alignment,
+ (u64)osd_data->length);
+ total_pages += num_pages;
+ for (j = 0; j < num_pages; j++) {
+ page = osd_data->pages[j];
+ BUG_ON(!page);
+ WARN_ON(!PageUptodate(page));
+
+ if (atomic_long_dec_return(&fsc->writeback_count) <
+ CONGESTION_OFF_THRESH(
+ fsc->mount_options->congestion_kb))
+ clear_bdi_congested(&fsc->backing_dev_info,
+ BLK_RW_ASYNC);
+
+ ceph_put_snap_context(page_snap_context(page));
+ page->private = 0;
+ ClearPagePrivate(page);
+ dout("unlocking %p\n", page);
+ end_page_writeback(page);
+
+ if (remove_page)
+ generic_error_remove_page(inode->i_mapping,
+ page);
- /*
- * We lost the cache cap, need to truncate the page before
- * it is unlocked, otherwise we'd truncate it later in the
- * page truncation thread, possibly losing some data that
- * raced its way in
- */
- if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
- generic_error_remove_page(inode->i_mapping, page);
+ unlock_page(page);
+ }
+ dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
+ inode, osd_data->length, rc >= 0 ? num_pages : 0);
- unlock_page(page);
+ ceph_release_pages(osd_data->pages, num_pages);
}
- dout("%p wrote+cleaned %d pages\n", inode, wrote);
- ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);
- ceph_release_pages(osd_data->pages, num_pages);
+ ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
+
+ osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool)
mempool_free(osd_data->pages,
ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
@@ -725,9 +725,9 @@ static int ceph_writepages_start(struct address_space *mapping,
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
- if (wsize < PAGE_CACHE_SIZE)
- wsize = PAGE_CACHE_SIZE;
- max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+ if (wsize < PAGE_SIZE)
+ wsize = PAGE_SIZE;
+ max_pages_ever = wsize >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
@@ -737,8 +737,8 @@ static int ceph_writepages_start(struct address_space *mapping,
end = -1;
dout(" cyclic, start at %lu\n", start);
} else {
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ start = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
should_loop = 0;
@@ -778,17 +778,15 @@ retry:
while (!done && index <= end) {
unsigned i;
int first;
- pgoff_t next;
- int pvec_pages, locked_pages;
- struct page **pages = NULL;
+ pgoff_t strip_unit_end = 0;
+ int num_ops = 0, op_idx;
+ int pvec_pages, locked_pages = 0;
+ struct page **pages = NULL, **data_pages;
mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
int want;
- u64 offset, len;
- long writeback_stat;
+ u64 offset = 0, len = 0;
- next = 0;
- locked_pages = 0;
max_pages = max_pages_ever;
get_more_pages:
@@ -824,8 +822,8 @@ get_more_pages:
unlock_page(page);
break;
}
- if (next && (page->index != next)) {
- dout("not consecutive %p\n", page);
+ if (strip_unit_end && (page->index > strip_unit_end)) {
+ dout("end of strip unit %p\n", page);
unlock_page(page);
break;
}
@@ -867,36 +865,31 @@ get_more_pages:
/*
* We have something to write. If this is
* the first locked page this time through,
- * allocate an osd request and a page array
- * that it will use.
+ * calculate max possible write size and
+ * allocate a page array
*/
if (locked_pages == 0) {
- BUG_ON(pages);
+ u64 objnum;
+ u64 objoff;
+
/* prepare async write request */
offset = (u64)page_offset(page);
len = wsize;
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0,
- do_sync ? 2 : 1,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE |
- CEPH_OSD_FLAG_ONDISK,
- snapc, truncate_seq,
- truncate_size, true);
- if (IS_ERR(req)) {
- rc = PTR_ERR(req);
+
+ rc = ceph_calc_file_object_mapping(&ci->i_layout,
+ offset, len,
+ &objnum, &objoff,
+ &len);
+ if (rc < 0) {
unlock_page(page);
break;
}
- if (do_sync)
- osd_req_op_init(req, 1,
- CEPH_OSD_OP_STARTSYNC, 0);
-
- req->r_callback = writepages_finish;
- req->r_inode = inode;
+ num_ops = 1 + do_sync;
+ strip_unit_end = page->index +
+ ((len - 1) >> PAGE_SHIFT);
+ BUG_ON(pages);
max_pages = calc_pages_for(0, (u64)len);
pages = kmalloc(max_pages * sizeof (*pages),
GFP_NOFS);
@@ -905,6 +898,20 @@ get_more_pages:
pages = mempool_alloc(pool, GFP_NOFS);
BUG_ON(!pages);
}
+
+ len = 0;
+ } else if (page->index !=
+ (offset + len) >> PAGE_SHIFT) {
+ if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
+ CEPH_OSD_MAX_OPS)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ break;
+ }
+
+ num_ops++;
+ offset = (u64)page_offset(page);
+ len = 0;
}
/* note position of first page in pvec */
@@ -913,18 +920,16 @@ get_more_pages:
dout("%p will write page %p idx %lu\n",
inode, page, page->index);
- writeback_stat =
- atomic_long_inc_return(&fsc->writeback_count);
- if (writeback_stat > CONGESTION_ON_THRESH(
+ if (atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
set_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
}
- set_page_writeback(page);
pages[locked_pages] = page;
locked_pages++;
- next = page->index + 1;
+ len += PAGE_SIZE;
}
/* did we get anything? */
@@ -944,38 +949,119 @@ get_more_pages:
/* shift unused pages over in the pvec... we
* will need to release them below. */
for (j = i; j < pvec_pages; j++) {
- dout(" pvec leftover page %p\n",
- pvec.pages[j]);
+ dout(" pvec leftover page %p\n", pvec.pages[j]);
pvec.pages[j-i+first] = pvec.pages[j];
}
pvec.nr -= i-first;
}
- /* Format the osd request message and submit the write */
+new_request:
offset = page_offset(pages[0]);
- len = (u64)locked_pages << PAGE_CACHE_SHIFT;
- if (snap_size == -1) {
- len = min(len, (u64)i_size_read(inode) - offset);
- /* writepages_finish() clears writeback pages
- * according to the data length, so make sure
- * data length covers all locked pages */
- len = max(len, 1 +
- ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
- } else {
- len = min(len, snap_size - offset);
+ len = wsize;
+
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0, num_ops,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, false);
+ if (IS_ERR(req)) {
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0,
+ min(num_ops,
+ CEPH_OSD_SLAB_OPS),
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, truncate_seq,
+ truncate_size, true);
+ BUG_ON(IS_ERR(req));
}
- dout("writepages got %d pages at %llu~%llu\n",
- locked_pages, offset, len);
+ BUG_ON(len < page_offset(pages[locked_pages - 1]) +
+ PAGE_SIZE - offset);
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+
+ /* Format the osd request message and submit the write */
+ len = 0;
+ data_pages = pages;
+ op_idx = 0;
+ for (i = 0; i < locked_pages; i++) {
+ u64 cur_offset = page_offset(pages[i]);
+ if (offset + len != cur_offset) {
+ if (op_idx + do_sync + 1 == req->r_num_ops)
+ break;
+ osd_req_op_extent_dup_last(req, op_idx,
+ cur_offset - offset);
+ dout("writepages got pages at %llu~%llu\n",
+ offset, len);
+ osd_req_op_extent_osd_data_pages(req, op_idx,
+ data_pages, len, 0,
!!pool, false);
+ osd_req_op_extent_update(req, op_idx, len);
- pages = NULL; /* request message now owns the pages array */
- pool = NULL;
+ len = 0;
+ offset = cur_offset;
+ data_pages = pages + i;
+ op_idx++;
+ }
+
+ set_page_writeback(pages[i]);
+ len += PAGE_SIZE;
+ }
+
+ if (snap_size != -1) {
+ len = min(len, snap_size - offset);
+ } else if (i == locked_pages) {
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ u64 min_len = len + 1 - PAGE_SIZE;
+ len = min(len, (u64)i_size_read(inode) - offset);
+ len = max(len, min_len);
+ }
+ dout("writepages got pages at %llu~%llu\n", offset, len);
- /* Update the write op length in case we changed it */
+ osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
+ 0, !!pool, false);
+ osd_req_op_extent_update(req, op_idx, len);
+
+ if (do_sync) {
+ op_idx++;
+ osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0);
+ }
+ BUG_ON(op_idx + 1 != req->r_num_ops);
- osd_req_op_extent_update(req, 0, len);
+ pool = NULL;
+ if (i < locked_pages) {
+ BUG_ON(num_ops <= req->r_num_ops);
+ num_ops -= req->r_num_ops;
+ num_ops += do_sync;
+ locked_pages -= i;
+
+ /* allocate new pages array for next request */
+ data_pages = pages;
+ pages = kmalloc(locked_pages * sizeof (*pages),
+ GFP_NOFS);
+ if (!pages) {
+ pool = fsc->wb_pagevec_pool;
+ pages = mempool_alloc(pool, GFP_NOFS);
+ BUG_ON(!pages);
+ }
+ memcpy(pages, data_pages + i,
+ locked_pages * sizeof(*pages));
+ memset(data_pages + i, 0,
+ locked_pages * sizeof(*pages));
+ } else {
+ BUG_ON(num_ops != req->r_num_ops);
+ index = pages[i - 1]->index + 1;
+ /* request message now owns the pages array */
+ pages = NULL;
+ }
vino = ceph_vino(inode);
ceph_osdc_build_request(req, offset, snapc, vino.snap,
@@ -985,9 +1071,10 @@ get_more_pages:
BUG_ON(rc);
req = NULL;
- /* continue? */
- index = next;
- wbc->nr_to_write -= locked_pages;
+ wbc->nr_to_write -= i;
+ if (pages)
+ goto new_request;
+
if (wbc->nr_to_write <= 0)
done = 1;
@@ -1048,8 +1135,8 @@ static int ceph_update_writeable_page(struct file *file,
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
- loff_t page_off = pos & PAGE_CACHE_MASK;
- int pos_in_page = pos & ~PAGE_CACHE_MASK;
+ loff_t page_off = pos & PAGE_MASK;
+ int pos_in_page = pos & ~PAGE_MASK;
int end_in_page = pos_in_page + len;
loff_t i_size;
int r;
@@ -1104,20 +1191,20 @@ retry_locked:
}
/* full page? */
- if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+ if (pos_in_page == 0 && len == PAGE_SIZE)
return 0;
/* past end of file? */
- i_size = inode->i_size; /* caller holds i_mutex */
+ i_size = i_size_read(inode);
if (page_off >= i_size ||
(pos_in_page == 0 && (pos+len) >= i_size &&
- end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+ end_in_page - pos_in_page != PAGE_SIZE)) {
dout(" zeroing %p 0 - %d and %d - %d\n",
- page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+ page, pos_in_page, end_in_page, (int)PAGE_SIZE);
zero_user_segments(page,
0, pos_in_page,
- end_in_page, PAGE_CACHE_SIZE);
+ end_in_page, PAGE_SIZE);
return 0;
}
@@ -1141,7 +1228,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = file_inode(file);
struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
int r;
do {
@@ -1149,14 +1236,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
page = grab_cache_page_write_begin(mapping, index, 0);
if (!page)
return -ENOMEM;
- *pagep = page;
dout("write_begin file %p inode %p page %p %d~%d\n", file,
inode, page, (int)pos, (int)len);
r = ceph_update_writeable_page(file, pos, len, page);
if (r < 0)
- page_cache_release(page);
+ put_page(page);
else
*pagep = page;
} while (r == -EAGAIN);
@@ -1173,7 +1259,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file_inode(file);
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
int check_cap = 0;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
@@ -1184,8 +1270,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
zero_user_segment(page, from+copied, len);
/* did file size increase? */
- /* (no need for i_size_read(); we caller holds i_mutex */
- if (pos+copied > inode->i_size)
+ if (pos+copied > i_size_read(inode))
check_cap = ceph_inode_set_size(inode, pos+copied);
if (!PageUptodate(page))
@@ -1194,7 +1279,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
@@ -1237,11 +1322,11 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct page *pinned_page = NULL;
- loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
+ loff_t off = vmf->pgoff << PAGE_SHIFT;
int want, got, ret;
dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
- inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
+ inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
@@ -1258,7 +1343,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
}
dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
- inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
+ inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
ci->i_inline_version == CEPH_INLINE_NONE)
@@ -1267,24 +1352,24 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = -EAGAIN;
dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
- inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+ inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got), ret);
if (pinned_page)
- page_cache_release(pinned_page);
+ put_page(pinned_page);
ceph_put_cap_refs(ci, got);
if (ret != -EAGAIN)
return ret;
/* read inline data */
- if (off >= PAGE_CACHE_SIZE) {
+ if (off >= PAGE_SIZE) {
/* does not support inline data > PAGE_SIZE */
ret = VM_FAULT_SIGBUS;
} else {
int ret1;
struct address_space *mapping = inode->i_mapping;
struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out;
@@ -1293,12 +1378,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
CEPH_STAT_CAP_INLINE_DATA, true);
if (ret1 < 0 || off >= i_size_read(inode)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ret = VM_FAULT_SIGBUS;
goto out;
}
- if (ret1 < PAGE_CACHE_SIZE)
- zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
+ if (ret1 < PAGE_SIZE)
+ zero_user_segment(page, ret1, PAGE_SIZE);
else
flush_dcache_page(page);
SetPageUptodate(page);
@@ -1307,7 +1392,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
out:
dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
- inode, off, (size_t)PAGE_CACHE_SIZE, ret);
+ inode, off, (size_t)PAGE_SIZE, ret);
return ret;
}
@@ -1345,10 +1430,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
}
}
- if (off + PAGE_CACHE_SIZE <= size)
- len = PAGE_CACHE_SIZE;
+ if (off + PAGE_SIZE <= size)
+ len = PAGE_SIZE;
else
- len = size & ~PAGE_CACHE_MASK;
+ len = size & ~PAGE_MASK;
dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
inode, ceph_vinop(inode), off, len, size);
@@ -1378,11 +1463,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
if ((off > size) ||
- (page->mapping != inode->i_mapping))
+ (page->mapping != inode->i_mapping)) {
+ unlock_page(page);
goto out;
+ }
ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
- if (ret == 0) {
+ if (ret >= 0) {
/* success. we'll keep the page locked. */
set_page_dirty(page);
ret = VM_FAULT_LOCKED;
@@ -1393,8 +1480,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
}
out:
- if (ret != VM_FAULT_LOCKED)
- unlock_page(page);
if (ret == VM_FAULT_LOCKED ||
ci->i_inline_version != CEPH_INLINE_NONE) {
int dirty;
@@ -1428,12 +1513,13 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
if (i_size_read(inode) == 0)
return;
page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
return;
if (PageUptodate(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return;
}
}
@@ -1448,14 +1534,14 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
if (page != locked_page) {
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
else
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
@@ -1492,7 +1578,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
from_pagecache = true;
lock_page(page);
} else {
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
}
@@ -1500,8 +1586,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
if (page) {
len = i_size_read(inode);
- if (len > PAGE_CACHE_SIZE)
- len = PAGE_CACHE_SIZE;
+ if (len > PAGE_SIZE)
+ len = PAGE_SIZE;
} else {
page = __page_cache_alloc(GFP_NOFS);
if (!page) {
@@ -1523,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ceph_empty_snapc, 0, 0, false);
+ NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -1541,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 1, 3,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
- ceph_empty_snapc,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
@@ -1585,7 +1670,7 @@ out:
if (page && page != locked_page) {
if (from_pagecache) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
} else
__free_pages(page, 0);
}
@@ -1664,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out;
}
- rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
- ceph_empty_snapc,
+ rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1, false, GFP_NOFS);
if (!rd_req) {
err = -ENOMEM;
@@ -1679,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
"%llx.00000000", ci->i_vino.ino);
rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
- wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
- ceph_empty_snapc,
+ wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
1, false, GFP_NOFS);
if (!wr_req) {
err = -ENOMEM;
@@ -1757,6 +1840,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
u32 pool;
int ret, flags;
+ /* does not support pool namespace yet */
+ if (ci->i_pool_ns_len)
+ return -EIO;
+
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 834f9f3723fb..a351480dbabc 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
const struct ceph_inode_info* ci = cookie_netfs_data;
uint16_t klen;
- /* use ceph virtual inode (id + snaphot) */
+ /* use ceph virtual inode (id + snapshot) */
klen = sizeof(ci->i_vino);
if (klen > maxbuf)
return 0;
@@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
memset(&aux, 0, sizeof(aux));
aux.mtime = inode->i_mtime;
- aux.size = inode->i_size;
+ aux.size = i_size_read(inode);
memcpy(buffer, &aux, sizeof(aux));
@@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
uint64_t *size)
{
const struct ceph_inode_info* ci = cookie_netfs_data;
- const struct inode* inode = &ci->vfs_inode;
-
- *size = inode->i_size;
+ *size = i_size_read(&ci->vfs_inode);
}
static enum fscache_checkaux ceph_fscache_inode_check_aux(
@@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
memset(&aux, 0, sizeof(aux));
aux.mtime = inode->i_mtime;
- aux.size = inode->i_size;
+ aux.size = i_size_read(inode);
if (memcmp(data, &aux, sizeof(aux)) != 0)
return FSCACHE_CHECKAUX_OBSOLETE;
@@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
return;
/* Avoid multiple racing open requests */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (ci->fscache)
goto done;
@@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
ci, true);
fscache_check_consistency(ci->fscache);
done:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 27b566874bc1..cfaeef18cbca 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -991,7 +991,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
- u64 time_warp_seq,
+ struct timespec *ctime, u64 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
@@ -1042,6 +1042,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_timespec(&fc->mtime, mtime);
if (atime)
ceph_encode_timespec(&fc->atime, atime);
+ if (ctime)
+ ceph_encode_timespec(&fc->ctime, ctime);
fc->time_warp_seq = cpu_to_le32(time_warp_seq);
fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
@@ -1116,7 +1118,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
int held, revoking, dropping, keep;
u64 seq, issue_seq, mseq, time_warp_seq, follows;
u64 size, max_size;
- struct timespec mtime, atime;
+ struct timespec mtime, atime, ctime;
int wake = 0;
umode_t mode;
kuid_t uid;
@@ -1180,6 +1182,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ci->i_requested_max_size = max_size;
mtime = inode->i_mtime;
atime = inode->i_atime;
+ ctime = inode->i_ctime;
time_warp_seq = ci->i_time_warp_seq;
uid = inode->i_uid;
gid = inode->i_gid;
@@ -1198,7 +1201,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
op, keep, want, flushing, seq,
flush_tid, oldest_flush_tid, issue_seq, mseq,
- size, max_size, &mtime, &atime, time_warp_seq,
+ size, max_size, &mtime, &atime, &ctime, time_warp_seq,
uid, gid, mode, xattr_version, xattr_blob,
follows, inline_data);
if (ret < 0) {
@@ -1320,7 +1323,7 @@ retry:
capsnap->dirty, 0, capsnap->flush_tid, 0,
0, mseq, capsnap->size, 0,
&capsnap->mtime, &capsnap->atime,
- capsnap->time_warp_seq,
+ &capsnap->ctime, capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode,
capsnap->xattr_version, capsnap->xattr_blob,
capsnap->follows, capsnap->inline_data);
@@ -1655,9 +1658,8 @@ retry_locked:
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
- (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
+ (revoking & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
dout("check_caps trying to invalidate on %p\n", inode);
if (try_nonblocking_invalidate(inode) < 0) {
@@ -1971,49 +1973,46 @@ out:
}
/*
- * wait for any uncommitted directory operations to commit.
+ * wait for any unsafe requests to complete.
*/
-static int unsafe_dirop_wait(struct inode *inode)
+static int unsafe_request_wait(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
-
- if (!S_ISDIR(inode->i_mode))
- return 0;
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- req = list_last_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- last_tid = req->r_tid;
-
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
+ spin_unlock(&ci->i_unsafe_lock);
- dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- ret = !wait_for_completion_timeout(&req->r_safe_completion,
- ceph_timeout_jiffies(req->r_timeout));
+ dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
+ inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ if (req1) {
+ ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+ ceph_timeout_jiffies(req1->r_timeout));
if (ret)
- ret = -EIO; /* timed out */
-
- ceph_mdsc_put_request(req);
-
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
- return ret;
+ err = -EIO;
+ ceph_mdsc_put_request(req1);
+ }
+ if (req2) {
+ ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+ ceph_timeout_jiffies(req2->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req2);
+ }
+ return err;
}
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2034,12 +2033,12 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- ret = unsafe_dirop_wait(inode);
+ ret = unsafe_request_wait(inode);
/*
* only wait on non-file metadata writeback (the mds
@@ -2050,7 +2049,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
ret = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
return ret;
@@ -2511,7 +2510,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
*pinned_page = page;
break;
}
- page_cache_release(page);
+ put_page(page);
}
/*
* drop cap refs first because getattr while
@@ -2757,7 +2756,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued)
+ struct ceph_cap *cap, int issued,
+ u32 pool_ns_len)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2877,6 +2877,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
ci->i_layout = grant->layout;
+ ci->i_pool_ns_len = pool_ns_len;
+
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(grant->truncate_seq),
@@ -3415,6 +3417,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
+ u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3467,6 +3470,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len;
}
+ if (le16_to_cpu(msg->hdr.version) >= 8) {
+ u64 flush_tid;
+ u32 caller_uid, caller_gid;
+ u32 osd_epoch_barrier;
+ /* version >= 5 */
+ ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+ /* version >= 6 */
+ ceph_decode_64_safe(&p, end, flush_tid, bad);
+ /* version >= 7 */
+ ceph_decode_32_safe(&p, end, caller_uid, bad);
+ ceph_decode_32_safe(&p, end, caller_gid, bad);
+ /* version >= 8 */
+ ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ }
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
@@ -3522,7 +3540,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
&cap, &issued);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3546,7 +3565,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
issued |= __ceph_caps_dirty(ci);
handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued);
+ msg->middle, session, cap, issued,
+ pool_ns_len);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9314b4ea2375..4fb2bbc2a272 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata)
return 0;
- di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
+ di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di)
return -ENOMEM; /* oh well */
@@ -68,23 +68,6 @@ out_unlock:
return 0;
}
-struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
-{
- struct inode *inode = NULL;
-
- if (!dentry)
- return NULL;
-
- spin_lock(&dentry->d_lock);
- if (!IS_ROOT(dentry)) {
- inode = d_inode(dentry->d_parent);
- ihold(inode);
- }
- spin_unlock(&dentry->d_lock);
- return inode;
-}
-
-
/*
* for readdir, we encode the directory frag and offset within that
* frag into f_pos.
@@ -146,7 +129,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct inode *dir = d_inode(parent);
struct dentry *dentry, *last = NULL;
struct ceph_dentry_info *di;
- unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
+ unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
int err = 0;
loff_t ptr_pos = 0;
struct ceph_readdir_cache_control cache_ctl = {};
@@ -171,7 +154,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
}
err = -EAGAIN;
- pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
+ pgoff = ptr_pos >> PAGE_SHIFT;
if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
ceph_readdir_cache_release(&cache_ctl);
cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
@@ -507,7 +490,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = -EINVAL;
switch (whence) {
case SEEK_CUR:
@@ -542,7 +525,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
}
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
@@ -624,6 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int op;
+ int mask;
int err;
dout("lookup %p dentry %p '%pd'\n",
@@ -666,8 +650,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(req);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- /* we only need inode linkage */
- req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
req->r_locked_dir = dir;
err = ceph_mdsc_do_request(mdsc, NULL, req);
err = ceph_handle_snapdir(req, dentry, err);
@@ -1095,6 +1083,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int valid = 0;
+ struct dentry *parent;
struct inode *dir;
if (flags & LOOKUP_RCU)
@@ -1103,7 +1092,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
- dir = ceph_get_dentry_parent_inode(dentry);
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1121,13 +1111,48 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = 1;
}
+ if (!valid) {
+ struct ceph_mds_client *mdsc =
+ ceph_sb_to_client(dir->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ int op, mask, err;
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (!IS_ERR(req)) {
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = mask;
+
+ req->r_locked_dir = dir;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == 0 || err == -ENOENT) {
+ if (dentry == req->r_dentry) {
+ valid = !d_unhashed(dentry);
+ } else {
+ d_invalidate(req->r_dentry);
+ err = -EAGAIN;
+ }
+ }
+ ceph_mdsc_put_request(req);
+ dout("d_revalidate %p lookup result=%d\n",
+ dentry, err);
+ }
+ }
+
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
if (valid) {
ceph_dentry_lru_touch(dentry);
} else {
ceph_dir_clear_complete(dir);
}
- iput(dir);
+
+ dput(parent);
return valid;
}
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fe02ae7f056a..6e72c98162d5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
inode = ceph_find_inode(sb, vino);
if (!inode) {
struct ceph_mds_request *req;
+ int mask;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
USE_ANY_MDS);
if (IS_ERR(req))
return ERR_CAST(req);
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
req->r_ino1 = vino;
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb,
struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
+ int mask;
int err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
@@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb,
.snap = CEPH_NOSNAP,
};
}
+
+ mask = CEPH_STAT_CAP_INODE;
+ if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
inode = req->r_target_inode;
@@ -215,7 +228,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
if (IS_ERR(req))
return PTR_ERR(req);
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
req->r_inode = d_inode(child);
ihold(d_inode(child));
@@ -224,7 +237,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
if (!err) {
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0c62868b5c56..a79f9269831e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -34,6 +34,74 @@
* need to wait for MDS acknowledgement.
*/
+/*
+ * Calculate the length sum of direct io vectors that can
+ * be combined into one page vector.
+ *
+ * @it: iterator to inspect; it is only read here, never advanced.
+ *
+ * The sum starts with the first segment's length minus it->iov_offset
+ * and grows by the full length of every following segment accepted by
+ * the alignment test in the loop.  Returns that byte count.
+ */
+static size_t dio_get_pagev_size(const struct iov_iter *it)
+{
+ const struct iovec *iov = it->iov;
+ const struct iovec *iovend = iov + it->nr_segs;
+ size_t size;
+
+ size = iov->iov_len - it->iov_offset;
+ /*
+ * An iov can be page vectored when both the current tail
+ * and the next base are page aligned.
+ *
+ * The && short-circuits: iov is only stepped forward once the
+ * current segment ends on a page boundary, and the next base is
+ * only examined if that step stayed below iovend.
+ */
+ while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
+ (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
+ size += iov->iov_len;
+ }
+ dout("dio_get_pagevlen len = %zu\n", size); /* note: label differs from the function name */
+ return size;
+}
+
+/*
+ * Allocate a page vector based on (@it, @nbytes).
+ * The return value is the tuple describing a page vector,
+ * that is (@pages, @page_align, @num_pages).
+ *
+ * @it: source iterator; a private copy is walked, so the caller's
+ *      iterator is left where it was.
+ * @nbytes: number of bytes the page vector must cover.
+ * @page_align: out, offset of the data within the first page.
+ * @num_pages: out, number of entries in the returned array.
+ *
+ * Both outputs are written only on success.  On failure an ERR_PTR
+ * is returned: -ENOMEM if the array cannot be allocated, otherwise
+ * the negative value from iov_iter_get_pages(), after the pages
+ * gathered so far have been handed to ceph_put_page_vector().
+ */
+static struct page **
+dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
+ size_t *page_align, int *num_pages)
+{
+ struct iov_iter tmp_it = *it;
+ size_t align;
+ struct page **pages;
+ int ret = 0, idx, npages;
+
+ align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+ (PAGE_SIZE - 1);
+ npages = calc_pages_for(align, nbytes);
+ pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+ if (!pages) { /* fall back to vmalloc when kmalloc fails */
+ pages = vmalloc(sizeof(*pages) * npages);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * Fill the array in batches; idx is advanced by the number of
+ * pages each batch covers, so the loop has no increment clause.
+ * NOTE(review): a return of 0 with start == 0 would leave idx
+ * unchanged - confirm iov_iter_get_pages() cannot do that here.
+ */
+ for (idx = 0; idx < npages; ) {
+ size_t start;
+ ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
+ npages - idx, &start);
+ if (ret < 0)
+ goto fail;
+
+ iov_iter_advance(&tmp_it, ret);
+ nbytes -= ret;
+ idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE; /* round the batch up to whole pages */
+ }
+
+ BUG_ON(nbytes != 0); /* all requested bytes must have been consumed */
+ *num_pages = npages;
+ *page_align = align;
+ dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
+ return pages;
+fail:
+ ceph_put_page_vector(pages, idx, false); /* idx = number of entries filled so far */
+ return ERR_PTR(ret);
+}
/*
* Prepare an open request. Preallocate ceph_cap to avoid an
@@ -89,7 +157,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
- cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
+ cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM;
@@ -232,6 +300,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acls_info acls = {};
+ int mask;
int err;
dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
@@ -267,6 +336,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
acls.pagelist = NULL;
}
}
+
+ mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
+ if (ceph_security_xattr_wanted(dir))
+ mask |= CEPH_CAP_XATTR_SHARED;
+ req->r_args.open.mask = cpu_to_le32(mask);
+
req->r_locked_dir = dir; /* caller holds dir->i_mutex */
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
@@ -329,8 +404,9 @@ int ceph_release(struct inode *inode, struct file *file)
}
enum {
- CHECK_EOF = 1,
- READ_INLINE = 2,
+ HAVE_RETRIED = 1,
+ CHECK_EOF = 2,
+ READ_INLINE = 3,
};
/*
@@ -343,17 +419,15 @@ enum {
static int striped_read(struct inode *inode,
u64 off, u64 len,
struct page **pages, int num_pages,
- int *checkeof, bool o_direct,
- unsigned long buf_align)
+ int *checkeof)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 pos, this_len, left;
- int io_align, page_align;
- int pages_left;
- int read;
+ loff_t i_size;
+ int page_align, pages_left;
+ int read, ret;
struct page **page_pos;
- int ret;
bool hit_stripe, was_short;
/*
@@ -364,13 +438,9 @@ static int striped_read(struct inode *inode,
page_pos = pages;
pages_left = num_pages;
read = 0;
- io_align = off & ~PAGE_MASK;
more:
- if (o_direct)
- page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
- else
- page_align = pos & ~PAGE_MASK;
+ page_align = pos & ~PAGE_MASK;
this_len = left;
ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, pos, &this_len,
@@ -384,20 +454,19 @@ more:
dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+ i_size = i_size_read(inode);
if (ret >= 0) {
int didpages;
- if (was_short && (pos + ret < inode->i_size)) {
- int zlen = min(this_len - ret,
- inode->i_size - pos - ret);
- int zoff = (o_direct ? buf_align : io_align) +
- read + ret;
+ if (was_short && (pos + ret < i_size)) {
+ int zlen = min(this_len - ret, i_size - pos - ret);
+ int zoff = (off & ~PAGE_MASK) + read + ret;
dout(" zero gap %llu to %llu\n",
pos + ret, pos + ret + zlen);
ceph_zero_page_vector_range(zoff, zlen, pages);
ret += zlen;
}
- didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
+ didpages = (page_align + ret) >> PAGE_SHIFT;
pos += ret;
read = pos - off;
left -= ret;
@@ -405,14 +474,14 @@ more:
pages_left -= didpages;
/* hit stripe and need continue*/
- if (left && hit_stripe && pos < inode->i_size)
+ if (left && hit_stripe && pos < i_size)
goto more;
}
if (read > 0) {
ret = read;
/* did we bounce off eof? */
- if (pos + left > inode->i_size)
+ if (pos + left > i_size)
*checkeof = CHECK_EOF;
}
@@ -453,63 +522,222 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
if (ret < 0)
return ret;
- if (iocb->ki_flags & IOCB_DIRECT) {
- while (iov_iter_count(i)) {
- size_t start;
- ssize_t n;
+ num_pages = calc_pages_for(off, len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ ret = striped_read(inode, off, len, pages,
+ num_pages, checkeof);
+ if (ret > 0) {
+ int l, k = 0;
+ size_t left = ret;
+
+ while (left) {
+ size_t page_off = off & ~PAGE_MASK;
+ size_t copy = min_t(size_t, left,
+ PAGE_SIZE - page_off);
+ l = copy_page_to_iter(pages[k++], page_off, copy, i);
+ off += l;
+ left -= l;
+ if (l < copy)
+ break;
+ }
+ }
+ ceph_release_page_vector(pages, num_pages);
- n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
- if (n < 0)
- return n;
+ if (off > iocb->ki_pos) {
+ ret = off - iocb->ki_pos;
+ iocb->ki_pos = off;
+ }
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ dout("sync_read result %d\n", ret);
+ return ret;
+}
- ret = striped_read(inode, off, n,
- pages, num_pages, checkeof,
- 1, start);
+struct ceph_aio_request { /* state shared by every OSD request issued for one async direct I/O */
+ struct kiocb *iocb; /* the I/O being served; finished through iocb->ki_complete() */
+ size_t total_len; /* sum of the queued request lengths; reported when no error was recorded */
+ int write; /* non-zero for a write, zero for a read */
+ int error; /* first negative OSD result, stored with cmpxchg(); 0 if none */
+ struct list_head osd_reqs; /* requests linked through r_unsafe_item until they are started */
+ unsigned num_reqs; /* number of requests added to osd_reqs */
+ atomic_t pending_reqs; /* requests not yet completed; the drop to zero completes iocb */
+ struct timespec mtime; /* write timestamp, passed again when a request is rebuilt for retry */
+ struct ceph_cap_flush *prealloc_cf; /* swapped in from the submitter for writes; used when marking caps dirty */
+};
- ceph_put_page_vector(pages, num_pages, true);
+struct ceph_aio_work { /* deferred retry of one OSD request, executed by ceph_aio_retry_work() */
+ struct work_struct work; /* work item queued on the client's wb_wq */
+ struct ceph_osd_request *req; /* the request whose result was -EOLDSNAPC */
+};
- if (ret <= 0)
- break;
- off += ret;
- iov_iter_advance(i, ret);
- if (ret < n)
- break;
+static void ceph_aio_retry_work(struct work_struct *work);
+/*
+ * Account for one finished OSD request of an async direct I/O and,
+ * if it was the last one outstanding, complete the whole kiocb.
+ *
+ * @inode: inode the I/O was issued against.
+ * @aio_req: shared per-iocb state; freed here on final completion.
+ *
+ * The value handed to ki_complete() is aio_req->error when one was
+ * recorded, otherwise aio_req->total_len.
+ */
+static void ceph_aio_complete(struct inode *inode,
+ struct ceph_aio_request *aio_req)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+ /* every call drops one pending count; only the final one continues */
+ if (!atomic_dec_and_test(&aio_req->pending_reqs))
+ return;
+
+ ret = aio_req->error;
+ if (!ret)
+ ret = aio_req->total_len;
+
+ dout("ceph_aio_complete %p rc %d\n", inode, ret);
+
+ if (ret >= 0 && aio_req->write) {
+ int dirty;
+ /* a successful write may end beyond the current i_size */
+ loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
+ if (endoff > i_size_read(inode)) {
+ if (ceph_inode_set_size(inode, endoff))
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
- } else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
- ret = striped_read(inode, off, len, pages,
- num_pages, checkeof, 0, 0);
- if (ret > 0) {
- int l, k = 0;
- size_t left = ret;
-
- while (left) {
- size_t page_off = off & ~PAGE_MASK;
- size_t copy = min_t(size_t,
- PAGE_SIZE - page_off, left);
- l = copy_page_to_iter(pages[k++], page_off,
- copy, i);
- off += l;
- left -= l;
- if (l < copy)
- break;
+ /* under i_ceph_lock: reset the inline version and mark CEPH_CAP_FILE_WR dirty */
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+ &aio_req->prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+
+ }
+ /* release the cap reference obtained with ceph_get_cap_refs() before the requests were started */
+ ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
+ CEPH_CAP_FILE_RD));
+
+ aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+
+ ceph_free_cap_flush(aio_req->prealloc_cf);
+ kfree(aio_req);
+}
+/*
+ * Completion callback installed as r_callback on the OSD requests of
+ * an async direct I/O.
+ *
+ * @req: the finished request; r_result, r_inode and r_priv are read.
+ * @msg: not referenced in this function.
+ *
+ * A write that failed with -EOLDSNAPC is handed to a work item for
+ * resubmission.  A short read has its tail zero-filled.  In every
+ * other path the page vector and the request are released, a
+ * negative result is recorded in aio_req->error (first one wins) and
+ * ceph_aio_complete() is called.
+ */
+static void ceph_aio_complete_req(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ int rc = req->r_result;
+ struct inode *inode = req->r_inode;
+ struct ceph_aio_request *aio_req = req->r_priv;
+ struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+ int num_pages = calc_pages_for((u64)osd_data->alignment,
+ osd_data->length);
+
+ dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
+ inode, rc, osd_data->length);
+
+ if (rc == -EOLDSNAPC) {
+ struct ceph_aio_work *aio_work;
+ BUG_ON(!aio_req->write);
+ /* retry from a workqueue; the request is kept alive for ceph_aio_retry_work() */
+ aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
+ if (aio_work) {
+ INIT_WORK(&aio_work->work, ceph_aio_retry_work);
+ aio_work->req = req;
+ queue_work(ceph_inode_to_client(inode)->wb_wq,
+ &aio_work->work);
+ return;
+ }
+ rc = -ENOMEM; /* could not schedule the retry: fail the request instead */
+ } else if (!aio_req->write) {
+ if (rc == -ENOENT) /* a read that hits -ENOENT counts as zero bytes read */
+ rc = 0;
+ if (rc >= 0 && osd_data->length > rc) {
+ int zoff = osd_data->alignment + rc;
+ int zlen = osd_data->length - rc;
+ /*
+ * If read is satisfied by single OSD request,
+ * it can pass EOF. Otherwise read is within
+ * i_size.
+ *
+ * For the single-request case total_len is rewritten
+ * to the bytes read plus the zero-filled tail.
+ * NOTE(review): when endoff >= i_size, zlen keeps its
+ * initial value (length - rc) - confirm this is the
+ * intended result for a read at or past EOF.
+ */
+ if (aio_req->num_reqs == 1) {
+ loff_t i_size = i_size_read(inode);
+ loff_t endoff = aio_req->iocb->ki_pos + rc;
+ if (endoff < i_size)
+ zlen = min_t(size_t, zlen,
+ i_size - endoff);
+ aio_req->total_len = rc + zlen;
}
+
+ if (zlen > 0)
+ ceph_zero_page_vector_range(zoff, zlen,
+ osd_data->pages);
}
- ceph_release_page_vector(pages, num_pages);
}
- if (off > iocb->ki_pos) {
- ret = off - iocb->ki_pos;
- iocb->ki_pos = off;
+ ceph_put_page_vector(osd_data->pages, num_pages, false);
+ ceph_osdc_put_request(req);
+ /* keep only the first error: cmpxchg stores rc while error is still 0 */
+ if (rc < 0)
+ cmpxchg(&aio_req->error, 0, rc);
+
+ ceph_aio_complete(inode, aio_req);
+ return;
+}
+/*
+ * Work handler queued by ceph_aio_complete_req() for a write that
+ * returned -EOLDSNAPC: build a replacement OSD request with a freshly
+ * looked-up snap context and submit it.
+ *
+ * @work: embedded in a struct ceph_aio_work, which is freed here.
+ *
+ * If the replacement cannot be allocated or started, the failure is
+ * reported by calling ceph_aio_complete_req() directly with r_result
+ * set to the error.
+ */
+static void ceph_aio_retry_work(struct work_struct *work)
+{
+ struct ceph_aio_work *aio_work =
+ container_of(work, struct ceph_aio_work, work);
+ struct ceph_osd_request *orig_req = aio_work->req;
+ struct ceph_aio_request *aio_req = orig_req->r_priv;
+ struct inode *inode = orig_req->r_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_osd_request *req;
+ int ret;
+ /* take the context of the last entry on i_cap_snaps if a cap snap is pending, else the head context */
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ struct ceph_cap_snap *capsnap =
+ list_last_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ snapc = ceph_get_snap_context(capsnap->context);
+ } else {
+ BUG_ON(!ci->i_head_snapc);
+ snapc = ceph_get_snap_context(ci->i_head_snapc);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+
+ req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+ false, GFP_NOFS);
+ if (!req) {
+ ret = -ENOMEM;
+ req = orig_req; /* complete the original request with the error below */
+ goto out;
}
- dout("sync_read result %d\n", ret);
- return ret;
+ req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE;
+ req->r_base_oloc = orig_req->r_base_oloc;
+ req->r_base_oid = orig_req->r_base_oid;
+ /* op 0 is copied wholesale from the original request; op 1 is a new STARTSYNC */
+ req->r_ops[0] = orig_req->r_ops[0];
+ osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+
+ ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
+ snapc, CEPH_NOSNAP, &aio_req->mtime);
+ /* the original request is no longer needed once its op has been copied */
+ ceph_osdc_put_request(orig_req);
+
+ req->r_callback = ceph_aio_complete_req;
+ req->r_inode = inode;
+ req->r_priv = aio_req;
+
+ ret = ceph_osdc_start_request(req->r_osdc, req, false);
+out:
+ if (ret < 0) {
+ req->r_result = ret;
+ ceph_aio_complete_req(req, NULL);
+ }
+
+ ceph_put_snap_context(snapc);
+ kfree(aio_work);
+}
/*
@@ -545,16 +773,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
-/*
- * Synchronous write, straight from __user pointer or user pages.
- *
- * If write spans object boundary, just do multiple writes. (For a
- * correct atomic write, we should e.g. take write locks on all
- * objects, rollback on failure, etc.)
- */
static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
- struct ceph_snap_context *snapc)
+ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct ceph_snap_context *snapc,
+ struct ceph_cap_flush **pcf)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -563,44 +785,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct ceph_vino vino;
struct ceph_osd_request *req;
struct page **pages;
- int num_pages;
- int written = 0;
+ struct ceph_aio_request *aio_req = NULL;
+ int num_pages = 0;
int flags;
- int check_caps = 0;
int ret;
- struct timespec mtime = CURRENT_TIME;
- size_t count = iov_iter_count(from);
+ struct timespec mtime = current_fs_time(inode->i_sb);
+ size_t count = iov_iter_count(iter);
+ loff_t pos = iocb->ki_pos;
+ bool write = iov_iter_rw(iter) == WRITE;
- if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+ if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_direct_write on file %p %lld~%u\n", file, pos,
- (unsigned)count);
+ dout("sync_direct_read_write (%s) on file %p %lld~%u\n",
+ (write ? "write" : "read"), file, pos, (unsigned)count);
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
return ret;
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + count) >> PAGE_CACHE_SHIFT);
- if (ret < 0)
- dout("invalidate_inode_pages2_range returned %d\n", ret);
+ if (write) {
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + count) >> PAGE_SHIFT);
+ if (ret < 0)
+ dout("invalidate_inode_pages2_range returned %d\n", ret);
- flags = CEPH_OSD_FLAG_ORDERSNAP |
- CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE;
+ flags = CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE;
+ } else {
+ flags = CEPH_OSD_FLAG_READ;
+ }
- while (iov_iter_count(from) > 0) {
- u64 len = iov_iter_single_seg_count(from);
- size_t start;
- ssize_t n;
+ while (iov_iter_count(iter) > 0) {
+ u64 size = dio_get_pagev_size(iter);
+ size_t start = 0;
+ ssize_t len;
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len, 0,
- 2,/*include a 'startsync' command*/
- CEPH_OSD_OP_WRITE, flags, snapc,
+ vino, pos, &size, 0,
+ /*include a 'startsync' command*/
+ write ? 2 : 1,
+ write ? CEPH_OSD_OP_WRITE :
+ CEPH_OSD_OP_READ,
+ flags, snapc,
ci->i_truncate_seq,
ci->i_truncate_size,
false);
@@ -609,58 +839,136 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
break;
}
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
-
- n = iov_iter_get_pages_alloc(from, &pages, len, &start);
- if (unlikely(n < 0)) {
- ret = n;
+ len = size;
+ pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
+ if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
break;
}
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
- * throw out any page cache pages in this range. this
- * may block.
+ * To simplify error handling, allow AIO when IO within i_size
+ * or IO can be satisfied by single OSD request.
*/
- truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+n) | (PAGE_CACHE_SIZE-1));
- osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
- false, false);
+ if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
+ (len == count || pos + count <= i_size_read(inode))) {
+ aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
+ if (aio_req) {
+ aio_req->iocb = iocb;
+ aio_req->write = write;
+ INIT_LIST_HEAD(&aio_req->osd_reqs);
+ if (write) {
+ aio_req->mtime = mtime;
+ swap(aio_req->prealloc_cf, *pcf);
+ }
+ }
+ /* ignore error */
+ }
+
+ if (write) {
+ /*
+ * throw out any page cache pages in this range. this
+ * may block.
+ */
+ truncate_inode_pages_range(inode->i_mapping, pos,
+ (pos+len) | (PAGE_SIZE - 1));
+
+ osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+ }
+
+
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
+ false, false);
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (aio_req) {
+ aio_req->total_len += len;
+ aio_req->num_reqs++;
+ atomic_inc(&aio_req->pending_reqs);
+
+ req->r_callback = ceph_aio_complete_req;
+ req->r_inode = inode;
+ req->r_priv = aio_req;
+ list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+
+ pos += len;
+ iov_iter_advance(iter, len);
+ continue;
+ }
+
+ ret = ceph_osdc_start_request(req->r_osdc, req, false);
if (!ret)
ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ size = i_size_read(inode);
+ if (!write) {
+ if (ret == -ENOENT)
+ ret = 0;
+ if (ret >= 0 && ret < len && pos + ret < size) {
+ int zlen = min_t(size_t, len - ret,
+ size - pos - ret);
+ ceph_zero_page_vector_range(start + ret, zlen,
+ pages);
+ ret += zlen;
+ }
+ if (ret >= 0)
+ len = ret;
+ }
+
ceph_put_page_vector(pages, num_pages, false);
ceph_osdc_put_request(req);
- if (ret)
+ if (ret < 0)
break;
- pos += n;
- written += n;
- iov_iter_advance(from, n);
- if (pos > i_size_read(inode)) {
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
+ pos += len;
+ iov_iter_advance(iter, len);
+
+ if (!write && pos >= size)
+ break;
+
+ if (write && pos > size) {
+ if (ceph_inode_set_size(inode, pos))
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_AUTHONLY,
NULL);
}
}
- if (ret != -EOLDSNAPC && written > 0) {
+ if (aio_req) {
+ if (aio_req->num_reqs == 0) {
+ kfree(aio_req);
+ return ret;
+ }
+
+ ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
+ CEPH_CAP_FILE_RD);
+
+ while (!list_empty(&aio_req->osd_reqs)) {
+ req = list_first_entry(&aio_req->osd_reqs,
+ struct ceph_osd_request,
+ r_unsafe_item);
+ list_del_init(&req->r_unsafe_item);
+ if (ret >= 0)
+ ret = ceph_osdc_start_request(req->r_osdc,
+ req, false);
+ if (ret < 0) {
+ req->r_result = ret;
+ ceph_aio_complete_req(req, NULL);
+ }
+ }
+ return -EIOCBQUEUED;
+ }
+
+ if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
+ ret = pos - iocb->ki_pos;
iocb->ki_pos = pos;
- ret = written;
}
return ret;
}
-
/*
* Synchronous write, straight from __user pointer or user pages.
*
@@ -685,7 +993,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
int flags;
int check_caps = 0;
int ret;
- struct timespec mtime = CURRENT_TIME;
+ struct timespec mtime = current_fs_time(inode->i_sb);
size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -698,8 +1006,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + count) >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ (pos + count) >> PAGE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
@@ -728,7 +1036,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
* write from beginning of first page,
* regardless of io alignment
*/
- num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
@@ -830,8 +1138,14 @@ again:
ceph_cap_string(got));
if (ci->i_inline_version == CEPH_INLINE_NONE) {
- /* hmm, this isn't really async... */
- ret = ceph_sync_read(iocb, to, &retry_op);
+ if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+ ret = ceph_direct_read_write(iocb, to,
+ NULL, NULL);
+ if (ret >= 0 && ret < len)
+ retry_op = CHECK_EOF;
+ } else {
+ ret = ceph_sync_read(iocb, to, &retry_op);
+ }
} else {
retry_op = READ_INLINE;
}
@@ -845,11 +1159,11 @@ again:
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
if (pinned_page) {
- page_cache_release(pinned_page);
+ put_page(pinned_page);
pinned_page = NULL;
}
ceph_put_cap_refs(ci, got);
- if (retry_op && ret >= 0) {
+ if (retry_op > HAVE_RETRIED && ret >= 0) {
int statret;
struct page *page = NULL;
loff_t i_size;
@@ -874,10 +1188,10 @@ again:
if (retry_op == READ_INLINE) {
BUG_ON(ret > 0 || read > 0);
if (iocb->ki_pos < i_size &&
- iocb->ki_pos < PAGE_CACHE_SIZE) {
+ iocb->ki_pos < PAGE_SIZE) {
loff_t end = min_t(loff_t, i_size,
iocb->ki_pos + len);
- end = min_t(loff_t, end, PAGE_CACHE_SIZE);
+ end = min_t(loff_t, end, PAGE_SIZE);
if (statret < end)
zero_user_segment(page, statret, end);
ret = copy_page_to_iter(page,
@@ -901,12 +1215,11 @@ again:
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
ret < len) {
dout("sync_read hit hole, ppos %lld < size %lld"
- ", reading more\n", iocb->ki_pos,
- inode->i_size);
+ ", reading more\n", iocb->ki_pos, i_size);
read += ret;
len -= ret;
- retry_op = 0;
+ retry_op = HAVE_RETRIED;
goto again;
}
}
@@ -947,7 +1260,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!prealloc_cf)
return -ENOMEM;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
@@ -985,7 +1298,7 @@ retry_snap:
}
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, count, inode->i_size);
+ inode, ceph_vinop(inode), pos, count, i_size_read(inode));
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
else
@@ -1003,7 +1316,7 @@ retry_snap:
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
struct ceph_snap_context *snapc;
struct iov_iter data;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
spin_lock(&ci->i_ceph_lock);
if (__ceph_have_pending_cap_snap(ci)) {
@@ -1021,8 +1334,8 @@ retry_snap:
/* we might need to revert back to that point */
data = *from;
if (iocb->ki_flags & IOCB_DIRECT)
- written = ceph_sync_direct_write(iocb, &data, pos,
- snapc);
+ written = ceph_direct_read_write(iocb, &data, snapc,
+ &prealloc_cf);
else
written = ceph_sync_write(iocb, &data, pos, snapc);
if (written == -EOLDSNAPC) {
@@ -1030,14 +1343,14 @@ retry_snap:
"got EOLDSNAPC, retrying\n",
inode, ceph_vinop(inode),
pos, (unsigned)count);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
goto retry_snap;
}
if (written > 0)
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
} else {
- loff_t old_size = inode->i_size;
+ loff_t old_size = i_size_read(inode);
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
@@ -1048,9 +1361,9 @@ retry_snap:
written = generic_perform_write(file, from, pos);
if (likely(written >= 0))
iocb->ki_pos = pos + written;
- if (inode->i_size > old_size)
+ if (i_size_read(inode) > old_size)
ceph_fscache_update_objectsize(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
if (written >= 0) {
@@ -1080,7 +1393,7 @@ retry_snap:
goto out_unlocked;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out_unlocked:
ceph_free_cap_flush(prealloc_cf);
current->backing_dev_info = NULL;
@@ -1093,9 +1406,10 @@ out_unlocked:
static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
+ loff_t i_size;
int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
@@ -1105,9 +1419,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
}
}
+ i_size = i_size_read(inode);
switch (whence) {
case SEEK_END:
- offset += inode->i_size;
+ offset += i_size;
break;
case SEEK_CUR:
/*
@@ -1123,24 +1438,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
offset += file->f_pos;
break;
case SEEK_DATA:
- if (offset >= inode->i_size) {
+ if (offset >= i_size) {
ret = -ENXIO;
goto out;
}
break;
case SEEK_HOLE:
- if (offset >= inode->i_size) {
+ if (offset >= i_size) {
ret = -ENXIO;
goto out;
}
- offset = inode->i_size;
+ offset = i_size;
break;
}
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
@@ -1148,21 +1463,21 @@ static inline void ceph_zero_partial_page(
struct inode *inode, loff_t offset, unsigned size)
{
struct page *page;
- pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t index = offset >> PAGE_SHIFT;
page = find_lock_page(inode->i_mapping, index);
if (page) {
wait_on_page_writeback(page);
- zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+ zero_user(page, offset & (PAGE_SIZE - 1), size);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
loff_t length)
{
- loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+ loff_t nearly = round_up(offset, PAGE_SIZE);
if (offset < nearly) {
loff_t size = nearly - offset;
if (length < size)
@@ -1171,8 +1486,8 @@ static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
offset += size;
length -= size;
}
- if (length >= PAGE_CACHE_SIZE) {
- loff_t size = round_down(length, PAGE_CACHE_SIZE);
+ if (length >= PAGE_SIZE) {
+ loff_t size = round_down(length, PAGE_SIZE);
truncate_pagecache_range(inode, offset, offset + size - 1);
offset += size;
length -= size;
@@ -1296,7 +1611,7 @@ static long ceph_fallocate(struct file *file, int mode,
if (!prealloc_cf)
return -ENOMEM;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (ceph_snap(inode) != CEPH_NOSNAP) {
ret = -EROFS;
@@ -1351,7 +1666,7 @@ static long ceph_fallocate(struct file *file, int mode,
ceph_put_cap_refs(ci, got);
unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ceph_free_cap_flush(prealloc_cf);
return ret;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 96d2bd829902..edfade037738 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+ ci->i_pool_ns_len = 0;
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -452,6 +453,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
ci->i_snap_realm = NULL;
@@ -547,7 +549,11 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
(truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
dout("size %lld -> %llu\n", inode->i_size, size);
- inode->i_size = size;
+ if (size > 0 && S_ISDIR(inode->i_mode)) {
+ pr_err("fill_file_size non-zero size for directory\n");
+ size = 0;
+ }
+ i_size_write(inode, size);
inode->i_blocks = (size + (1<<9) - 1) >> 9;
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
@@ -755,6 +761,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
ci->i_layout = info->layout;
+ ci->i_pool_ns_len = iinfo->pool_ns_len;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
@@ -807,7 +814,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
err = -EINVAL;
- if (WARN_ON(symlen != inode->i_size))
+ if (WARN_ON(symlen != i_size_read(inode)))
goto out;
err = -ENOMEM;
@@ -974,13 +981,8 @@ out_unlock:
/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
- *
- * we will only rehash the resulting dentry if @prehash is
- * true; @prehash will be set to false (for the benefit of
- * the caller) if we fail.
*/
-static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash)
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
{
struct dentry *realdn;
@@ -993,8 +995,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
- if (prehash)
- *prehash = false; /* don't rehash on error */
dn = realdn; /* note realdn contains the error */
goto out;
} else if (realdn) {
@@ -1010,8 +1010,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
dout("dn %p attached to %p ino %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)));
}
- if ((!prehash || *prehash) && d_unhashed(dn))
- d_rehash(dn);
out:
return dn;
}
@@ -1244,10 +1242,8 @@ retry_lookup:
dout("d_delete %p\n", dn);
d_delete(dn);
} else {
- dout("d_instantiate %p NULL\n", dn);
- d_instantiate(dn, NULL);
if (have_lease && d_unhashed(dn))
- d_rehash(dn);
+ d_add(dn, NULL);
update_dentry_lease(dn, rinfo->dlease,
session,
req->r_request_started);
@@ -1259,7 +1255,7 @@ retry_lookup:
if (d_really_is_negative(dn)) {
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in, &have_lease);
+ dn = splice_dentry(dn, in);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
@@ -1269,6 +1265,7 @@ retry_lookup:
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)),
ceph_vinop(in));
+ d_invalidate(dn);
have_lease = false;
}
@@ -1289,7 +1286,7 @@ retry_lookup:
dout(" linking snapped dir %p to dn %p\n", in, dn);
ceph_dir_clear_ordered(dir);
ihold(in);
- dn = splice_dentry(dn, in, NULL);
+ dn = splice_dentry(dn, in);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
@@ -1341,7 +1338,7 @@ void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
if (ctl->page) {
kunmap(ctl->page);
- page_cache_release(ctl->page);
+ put_page(ctl->page);
ctl->page = NULL;
}
}
@@ -1351,21 +1348,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_mds_request *req)
{
struct ceph_inode_info *ci = ceph_inode(dir);
- unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
+ unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
if (!ctl->page || pgoff != page_index(ctl->page)) {
ceph_readdir_cache_release(ctl);
- ctl->page = grab_cache_page(&dir->i_data, pgoff);
+ if (idx == 0)
+ ctl->page = grab_cache_page(&dir->i_data, pgoff);
+ else
+ ctl->page = find_lock_page(&dir->i_data, pgoff);
if (!ctl->page) {
ctl->index = -1;
- return -ENOMEM;
+ return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
+ if (idx == 0)
+ memset(ctl->dentries, 0, PAGE_SIZE);
}
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
@@ -1388,7 +1390,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct qstr dname;
struct dentry *dn;
struct inode *in;
- int err = 0, ret, i;
+ int err = 0, skipped = 0, ret, i;
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di;
@@ -1500,7 +1502,17 @@ retry_lookup:
}
if (d_really_is_negative(dn)) {
- struct dentry *realdn = splice_dentry(dn, in, NULL);
+ struct dentry *realdn;
+
+ if (ceph_security_xattr_deadlock(in)) {
+ dout(" skip splicing dn %p to inode %p"
+ " (security xattr deadlock)\n", dn, in);
+ iput(in);
+ skipped++;
+ goto next_item;
+ }
+
+ realdn = splice_dentry(dn, in);
if (IS_ERR(realdn)) {
err = PTR_ERR(realdn);
d_drop(dn);
@@ -1517,7 +1529,7 @@ retry_lookup:
req->r_session,
req->r_request_started);
- if (err == 0 && cache_ctl.index >= 0) {
+ if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req);
if (ret < 0)
@@ -1528,7 +1540,7 @@ next_item:
dput(dn);
}
out:
- if (err == 0) {
+ if (err == 0 && skipped == 0) {
req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index;
}
@@ -1548,7 +1560,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
- inode->i_size = size;
+ i_size_write(inode, size);
inode->i_blocks = (size + (1 << 9) - 1) >> 9;
/* tell the MDS if we are approaching max_size */
@@ -1755,7 +1767,7 @@ retry:
*/
static const struct inode_operations ceph_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ceph_setattr,
.getattr = ceph_getattr,
.setxattr = ceph_setxattr,
@@ -1910,7 +1922,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
inode->i_size, attr->ia_size);
if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) {
- inode->i_size = attr->ia_size;
+ i_size_write(inode, attr->ia_size);
inode->i_blocks =
(attr->ia_size + (1 << 9) - 1) >> 9;
inode->i_ctime = attr->ia_ctime;
@@ -1958,7 +1970,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
release &= issued;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 51cb02da75d9..541ead4d8965 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+ ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ *p += info->pool_ns_len;
+ } else {
+ info->pool_ns_len = 0;
+ }
+
return 0;
bad:
return err;
@@ -633,13 +641,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
- struct ceph_inode_info *ci = ceph_inode(dir);
-
ihold(dir);
- spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
- list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -665,13 +668,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
- if (req->r_unsafe_dir) {
+ if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_target_inode && req->r_got_unsafe) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_target_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_unsafe_dir) {
iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
@@ -1430,6 +1440,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
}
+ /* The inode has cached pages, but it's no longer used.
+ * we can safely drop it */
+ if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ !(oissued & CEPH_CAP_FILE_CACHE)) {
+ used = 0;
+ oissued = 0;
+ }
if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
@@ -1438,7 +1455,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
/* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true);
} else {
- /* try to drop referring dentries */
+ /* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
@@ -1593,7 +1610,7 @@ again:
while (!list_empty(&tmp_list)) {
if (!msg) {
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
- PAGE_CACHE_SIZE, GFP_NOFS, false);
+ PAGE_SIZE, GFP_NOFS, false);
if (!msg)
goto out_err;
head = msg->front.iov_base;
@@ -1704,6 +1721,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
req->r_started = jiffies;
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
INIT_LIST_HEAD(&req->r_wait);
@@ -1711,7 +1729,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
- req->r_stamp = CURRENT_TIME;
+ req->r_stamp = current_fs_time(mdsc->fsc->sb);
req->r_op = op;
req->r_direct_mode = mode;
@@ -1935,7 +1953,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
len = sizeof(*head) +
pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
- sizeof(struct timespec);
+ sizeof(struct ceph_timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -2288,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
+ /* deny access to directories with pool_ns layouts */
+ if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
+ ceph_inode(req->r_inode)->i_pool_ns_len)
+ return -EIO;
+ if (req->r_locked_dir &&
+ ceph_inode(req->r_locked_dir)->i_pool_ns_len)
+ return -EIO;
+
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2477,6 +2503,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
req->r_got_unsafe = true;
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_unsafe_dir);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item,
+ &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
}
dout("handle_reply tid %lld result %d\n", tid, result);
@@ -2506,6 +2540,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* insert trace into our cache */
mutex_lock(&req->r_fill_mutex);
+ current->journal_info = req;
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
@@ -2513,11 +2548,19 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
+ current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem);
if (realm)
ceph_put_snap_realm(mdsc, realm);
+
+ if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
@@ -3723,7 +3766,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
- ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
dout("handle_map epoch %u <= our %u\n",
@@ -3750,6 +3792,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
__wake_requests(mdsc, &mdsc->waiting_for_map);
+ ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
+ mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex);
schedule_delayed(mdsc);
@@ -3917,17 +3961,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_sign_message(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_sign_message(auth, msg);
}
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_check_message_signature(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_check_message_signature(auth, msg);
}
@@ -3940,8 +3986,8 @@ static const struct ceph_connection_operations mds_con_ops = {
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
.alloc_msg = mds_alloc_msg,
- .sign_message = sign_message,
- .check_message_signature = check_message_signature,
+ .sign_message = mds_sign_message,
+ .check_message_signature = mds_check_message_signature,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f575eafe2261..ee69a537dba5 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in {
u64 inline_version;
u32 inline_len;
char *inline_data;
+ u32 pool_ns_len;
};
/*
@@ -96,7 +97,7 @@ struct ceph_mds_reply_info_parsed {
/*
* cap releases are batched and sent to the MDS en masse.
*/
-#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \
sizeof(struct ceph_mds_cap_release)) / \
sizeof(struct ceph_mds_cap_item))
@@ -236,6 +237,9 @@ struct ceph_mds_request {
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
+ /* unsafe requests that modify the target inode */
+ struct list_head r_unsafe_target_item;
+
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 4aa7122a8d38..9caaa7ffc93f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -296,8 +296,6 @@ static int cmpu64_rev(const void *a, const void *b)
}
-struct ceph_snap_context *ceph_empty_snapc;
-
/*
* build the snap context for a given realm.
*/
@@ -987,17 +985,3 @@ out:
up_write(&mdsc->snap_rwsem);
return;
}
-
-int __init ceph_snap_init(void)
-{
- ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
- if (!ceph_empty_snapc)
- return -ENOMEM;
- ceph_empty_snapc->seq = 1;
- return 0;
-}
-
-void ceph_snap_exit(void)
-{
- ceph_put_snap_context(ceph_empty_snapc);
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f446afada328..f12d5e2955c2 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -439,8 +439,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
seq_puts(m, ",dirstat");
- if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
- seq_puts(m, ",norbytes");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
+ seq_puts(m, ",rbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
@@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->monc.want_mdsmap = 1;
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
fsc->mount_options = fsopt;
@@ -560,7 +560,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
/* set up mempools */
err = -ENOMEM;
- page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
+ page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
size = sizeof (struct page *) * (page_count ? page_count : 1);
fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
if (!fsc->wb_pagevec_pool)
@@ -639,8 +639,8 @@ static int __init init_caches(void)
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
sizeof(struct ceph_inode_info),
__alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ceph_inode_init_once);
if (ceph_inode_cachep == NULL)
return -ENOMEM;
@@ -793,22 +793,20 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
struct dentry *root;
int first = 0; /* first vfsmount for this super_block */
- dout("mount start\n");
+ dout("mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex);
- err = __ceph_open_session(fsc->client, started);
- if (err < 0)
- goto out;
+ if (!fsc->sb->s_root) {
+ err = __ceph_open_session(fsc->client, started);
+ if (err < 0)
+ goto out;
- dout("mount opening root\n");
- root = open_root_dentry(fsc, "", started);
- if (IS_ERR(root)) {
- err = PTR_ERR(root);
- goto out;
- }
- if (fsc->sb->s_root) {
- dput(root);
- } else {
+ dout("mount opening root\n");
+ root = open_root_dentry(fsc, "", started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
fsc->sb->s_root = root;
first = 1;
@@ -818,6 +816,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
}
if (path[0] == 0) {
+ root = fsc->sb->s_root;
dget(root);
} else {
dout("mount opening base mountpoint\n");
@@ -833,16 +832,14 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
mutex_unlock(&fsc->client->mount_mutex);
return root;
-out:
- mutex_unlock(&fsc->client->mount_mutex);
- return ERR_PTR(err);
-
fail:
if (first) {
dput(fsc->sb->s_root);
fsc->sb->s_root = NULL;
}
- goto out;
+out:
+ mutex_unlock(&fsc->client->mount_mutex);
+ return ERR_PTR(err);
}
static int ceph_set_super(struct super_block *s, void *data)
@@ -915,13 +912,13 @@ static int ceph_register_bdi(struct super_block *sb,
int err;
/* set ra_pages based on rasize mount option? */
- if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
+ if (fsc->mount_options->rasize >= PAGE_SIZE)
fsc->backing_dev_info.ra_pages =
- (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
+ (fsc->mount_options->rasize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else
fsc->backing_dev_info.ra_pages =
- VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+ VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
@@ -1042,19 +1039,14 @@ static int __init init_ceph(void)
ceph_flock_init();
ceph_xattr_init();
- ret = ceph_snap_init();
- if (ret)
- goto out_xattr;
ret = register_filesystem(&ceph_fs_type);
if (ret)
- goto out_snap;
+ goto out_xattr;
pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
return 0;
-out_snap:
- ceph_snap_exit();
out_xattr:
ceph_xattr_exit();
destroy_caches();
@@ -1066,7 +1058,6 @@ static void __exit exit_ceph(void)
{
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
- ceph_snap_exit();
ceph_xattr_exit();
destroy_caches();
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2f2460d23a06..e705c4d612d7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -37,8 +37,7 @@
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
-#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \
- CEPH_MOUNT_OPT_DCACHE)
+#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -287,6 +286,7 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
+ size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
@@ -342,6 +342,7 @@ struct ceph_inode_info {
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
@@ -467,7 +468,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
-
+#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
@@ -719,7 +720,6 @@ static inline int default_congestion_kb(void)
/* snap.c */
-extern struct ceph_snap_context *ceph_empty_snapc;
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino);
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
@@ -736,8 +736,6 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
-extern int ceph_snap_init(void);
-extern void ceph_snap_exit(void);
/*
* a cap_snap is "pending" if it is still awaiting an in-progress
@@ -806,6 +804,20 @@ extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void);
extern const struct xattr_handler *ceph_xattr_handlers[];
+#ifdef CONFIG_SECURITY
+extern bool ceph_security_xattr_deadlock(struct inode *in);
+extern bool ceph_security_xattr_wanted(struct inode *in);
+#else
+static inline bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ return false;
+}
+static inline bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return false;
+}
+#endif
+
/* acl.c */
struct ceph_acls_info {
void *default_acl;
@@ -945,7 +957,6 @@ extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
-extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 819163d8313b..9410abdef3ce 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -714,31 +714,62 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
}
}
+static inline int __get_request_mask(struct inode *in) {
+ struct ceph_mds_request *req = current->journal_info;
+ int mask = 0;
+ if (req && req->r_target_inode == in) {
+ if (req->r_op == CEPH_MDS_OP_LOOKUP ||
+ req->r_op == CEPH_MDS_OP_LOOKUPINO ||
+ req->r_op == CEPH_MDS_OP_LOOKUPPARENT ||
+ req->r_op == CEPH_MDS_OP_GETATTR) {
+ mask = le32_to_cpu(req->r_args.getattr.mask);
+ } else if (req->r_op == CEPH_MDS_OP_OPEN ||
+ req->r_op == CEPH_MDS_OP_CREATE) {
+ mask = le32_to_cpu(req->r_args.open.mask);
+ }
+ }
+ return mask;
+}
+
ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int err;
struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr = NULL;
+ int req_mask;
+ int err;
if (!ceph_is_valid_xattr(name))
return -ENODATA;
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
- if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
- err = vxattr->getxattr_cb(ci, value, size);
+ if (vxattr) {
+ err = -ENODATA;
+ if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
+ err = vxattr->getxattr_cb(ci, value, size);
return err;
}
+ req_mask = __get_request_mask(inode);
+
spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 ||
- !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
+ !((req_mask & CEPH_CAP_XATTR_SHARED) ||
+ __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
spin_unlock(&ci->i_ceph_lock);
+
+ /* security module gets xattr while filling trace */
+ if (current->journal_info != NULL) {
+ pr_warn_ratelimited("sync getxattr %p "
+ "during filling trace\n", inode);
+ return -EBUSY;
+ }
+
/* get xattrs from mds (if we don't already have them) */
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err)
@@ -765,6 +796,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
memcpy(value, xattr->val, xattr->val_len);
+ if (current->journal_info != NULL &&
+ !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
+ ci->i_ceph_flags |= CEPH_I_SEC_INITED;
out:
spin_unlock(&ci->i_ceph_lock);
return err;
@@ -999,7 +1033,7 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
spin_unlock(&ci->i_ceph_lock);
@@ -1015,7 +1049,15 @@ do_sync:
do_sync_unlocked:
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
- err = ceph_sync_setxattr(dentry, name, value, size, flags);
+
+ /* security module set xattr while filling trace */
+ if (current->journal_info != NULL) {
+ pr_warn_ratelimited("sync setxattr %p "
+ "during filling trace\n", inode);
+ err = -EBUSY;
+ } else {
+ err = ceph_sync_setxattr(dentry, name, value, size, flags);
+ }
out:
ceph_free_cap_flush(prealloc_cf);
kfree(newname);
@@ -1136,7 +1178,7 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
+ inode->i_ctime = current_fs_time(inode->i_sb);
spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem);
@@ -1164,3 +1206,25 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
return __ceph_removexattr(dentry, name);
}
+
+#ifdef CONFIG_SECURITY
+bool ceph_security_xattr_wanted(struct inode *in)
+{
+ return in->i_security != NULL;
+}
+
+bool ceph_security_xattr_deadlock(struct inode *in)
+{
+ struct ceph_inode_info *ci;
+ bool ret;
+ if (in->i_security == NULL)
+ return false;
+ ci = ceph_inode(in);
+ spin_lock(&ci->i_ceph_lock);
+ ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) &&
+ !(ci->i_xattrs.version > 0 &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0));
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+#endif
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7febcf2475c5..788e19195991 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -50,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("CIFS VFS: %pV", &vaf);
+ pr_err_ratelimited("CIFS VFS: %pV", &vaf);
va_end(args);
}
@@ -255,7 +255,6 @@ static const struct file_operations cifs_debug_data_proc_fops = {
static ssize_t cifs_stats_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
bool bv;
int rc;
struct list_head *tmp1, *tmp2, *tmp3;
@@ -263,11 +262,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- rc = get_user(c, buffer);
- if (rc)
- return rc;
-
- if (strtobool(&c, &bv) == 0) {
+ rc = kstrtobool_from_user(buffer, count, &bv);
+ if (rc == 0) {
#ifdef CONFIG_CIFS_STATS2
atomic_set(&totBufAllocCount, 0);
atomic_set(&totSmBufAllocCount, 0);
@@ -290,6 +286,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
}
}
spin_unlock(&cifs_tcp_ses_lock);
+ } else {
+ return rc;
}
return count;
@@ -433,17 +431,17 @@ static int cifsFYI_proc_open(struct inode *inode, struct file *file)
static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
+ char c[2] = { '\0' };
bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = get_user(c[0], buffer);
if (rc)
return rc;
- if (strtobool(&c, &bv) == 0)
+ if (strtobool(c, &bv) == 0)
cifsFYI = bv;
- else if ((c > '1') && (c <= '9'))
- cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */
+ else if ((c[0] > '1') && (c[0] <= '9'))
+ cifsFYI = (int) (c[0] - '0'); /* see cifs_debug.h for meanings */
return count;
}
@@ -471,20 +469,12 @@ static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_linux_ext_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &linuxExtEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- linuxExtEnabled = bv;
-
return count;
}
@@ -511,20 +501,12 @@ static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
static ssize_t cifs_lookup_cache_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &lookupCacheEnabled);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- lookupCacheEnabled = bv;
-
return count;
}
@@ -551,20 +533,12 @@ static int traceSMB_proc_open(struct inode *inode, struct file *file)
static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- char c;
- bool bv;
int rc;
- rc = get_user(c, buffer);
+ rc = kstrtobool_from_user(buffer, count, &traceSMB);
if (rc)
return rc;
- rc = strtobool(&c, &bv);
- if (rc)
- return rc;
-
- traceSMB = bv;
-
return count;
}
@@ -622,7 +596,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
int rc;
unsigned int flags;
char flags_string[12];
- char c;
bool bv;
if ((count < 1) || (count > 11))
@@ -635,11 +608,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if (count < 3) {
/* single char or single char followed by null */
- c = flags_string[0];
- if (strtobool(&c, &bv) == 0) {
+ if (strtobool(flags_string, &bv) == 0) {
global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF;
return count;
- } else if (!isdigit(c)) {
+ } else if (!isdigit(flags_string[0])) {
cifs_dbg(VFS, "Invalid SecurityFlags: %s\n",
flags_string);
return -EINVAL;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index f40fbaca1b2a..c611ca2339d7 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -25,7 +25,7 @@
void cifs_dump_mem(char *label, void *data, int length);
void cifs_dump_detail(void *);
void cifs_dump_mids(struct TCP_Server_Info *);
-extern int traceSMB; /* flag which enables the function below */
+extern bool traceSMB; /* flag which enables the function below */
void dump_smb(void *, int);
#define CIFS_INFO 0x01
#define CIFS_RC 0x02
@@ -51,14 +51,13 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...);
/* information message: e.g., configuration, major event */
#define cifs_dbg(type, fmt, ...) \
do { \
- if (type == FYI) { \
- if (cifsFYI & CIFS_INFO) { \
- pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \
- } \
+ if (type == FYI && cifsFYI & CIFS_INFO) { \
+ pr_debug_ratelimited("%s: " \
+ fmt, __FILE__, ##__VA_ARGS__); \
} else if (type == VFS) { \
cifs_vfs_err(fmt, ##__VA_ARGS__); \
} else if (type == NOISY && type != 0) { \
- pr_debug(fmt, ##__VA_ARGS__); \
+ pr_debug_ratelimited(fmt, ##__VA_ARGS__); \
} \
} while (0)
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7dc886c9a78f..e956cba94338 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
* string to the length of the original string to allow for worst case.
*/
md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
- mountdata = kzalloc(md_len + 1, GFP_KERNEL);
+ mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL);
if (mountdata == NULL) {
rc = -ENOMEM;
goto compose_mount_options_err;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index afa09fce8151..4897dacf8944 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -33,6 +33,7 @@
#include <linux/ctype.h>
#include <linux/random.h>
#include <linux/highmem.h>
+#include <crypto/skcipher.h>
static int
cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
@@ -714,7 +715,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
if (!ses->auth_key.response) {
- rc = ENOMEM;
+ rc = -ENOMEM;
ses->auth_key.len = 0;
goto setup_ntlmv2_rsp_ret;
}
@@ -789,38 +790,46 @@ int
calc_seckey(struct cifs_ses *ses)
{
int rc;
- struct crypto_blkcipher *tfm_arc4;
+ struct crypto_skcipher *tfm_arc4;
struct scatterlist sgin, sgout;
- struct blkcipher_desc desc;
+ struct skcipher_request *req;
unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */
get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
- tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_arc4)) {
rc = PTR_ERR(tfm_arc4);
cifs_dbg(VFS, "could not allocate crypto API arc4\n");
return rc;
}
- desc.tfm = tfm_arc4;
-
- rc = crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response,
+ rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response,
CIFS_SESS_KEY_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set response as a key\n",
__func__);
- return rc;
+ goto out_free_cipher;
+ }
+
+ req = skcipher_request_alloc(tfm_arc4, GFP_KERNEL);
+ if (!req) {
+ rc = -ENOMEM;
+ cifs_dbg(VFS, "could not allocate crypto API arc4 request\n");
+ goto out_free_cipher;
}
sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE);
sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE);
- rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sgin, &sgout, CIFS_CPHTXT_SIZE, NULL);
+
+ rc = crypto_skcipher_encrypt(req);
+ skcipher_request_free(req);
if (rc) {
cifs_dbg(VFS, "could not encrypt session key rc: %d\n", rc);
- crypto_free_blkcipher(tfm_arc4);
- return rc;
+ goto out_free_cipher;
}
/* make secondary_key/nonce as session key */
@@ -828,7 +837,8 @@ calc_seckey(struct cifs_ses *ses)
/* and make len as that of session key only */
ses->auth_key.len = CIFS_SESS_KEY_SIZE;
- crypto_free_blkcipher(tfm_arc4);
+out_free_cipher:
+ crypto_free_skcipher(tfm_arc4);
return rc;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e739950ca084..89201564c346 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -54,10 +54,10 @@
#endif
int cifsFYI = 0;
-int traceSMB = 0;
+bool traceSMB;
bool enable_oplocks = true;
-unsigned int linuxExtEnabled = 1;
-unsigned int lookupCacheEnabled = 1;
+bool linuxExtEnabled = true;
+bool lookupCacheEnabled = true;
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
unsigned int sign_CIFS_PDUs = 1;
@@ -454,6 +454,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nocase");
if (tcon->retry)
seq_puts(s, ",hard");
+ if (tcon->use_persistent)
+ seq_puts(s, ",persistenthandles");
+ else if (tcon->use_resilient)
+ seq_puts(s, ",resilienthandles");
if (tcon->unix_ext)
seq_puts(s, ",unix");
else
@@ -503,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rsize=%u", cifs_sb->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+ seq_printf(s, ",echo_interval=%lu",
+ tcon->ses->server->echo_interval / HZ);
/* convert actimeo and display it in seconds */
seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
@@ -636,9 +642,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
while (*s && *s != sep)
s++;
- mutex_lock(&dir->i_mutex);
- child = lookup_one_len(p, dentry, s - p);
- mutex_unlock(&dir->i_mutex);
+ child = lookup_one_len_unlocked(p, dentry, s - p);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
@@ -748,6 +752,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t rc;
struct inode *inode = file_inode(iocb->ki_filp);
+ if (iocb->ki_filp->f_flags & O_DIRECT)
+ return cifs_user_readv(iocb, iter);
+
rc = cifs_revalidate_mapping(inode);
if (rc)
return rc;
@@ -762,6 +769,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t written;
int rc;
+ if (iocb->ki_filp->f_flags & O_DIRECT) {
+ written = cifs_user_writev(iocb, from);
+ if (written > 0 && CIFS_CACHE_READ(cinode)) {
+ cifs_zap_mapping(inode);
+ cifs_dbg(FYI,
+ "Set no oplock for inode=%p after a write operation\n",
+ inode);
+ cinode->oplock = 0;
+ }
+ return written;
+ }
+
written = cifs_get_writer(cinode);
if (written)
return written;
@@ -896,8 +915,7 @@ const struct inode_operations cifs_file_inode_ops = {
const struct inode_operations cifs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = cifs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = cifs_get_link,
.permission = cifs_permission,
/* BB add the following two eventually */
/* revalidate: cifs_revalidate,
@@ -910,6 +928,59 @@ const struct inode_operations cifs_symlink_inode_ops = {
#endif
};
+/*
+ * Clone @len bytes starting at @off in @src_file into @dst_file at @destoff
+ * by handing the request to the server ops' duplicate_extents handler.
+ *
+ * A @len of 0 is expanded to "from @off up to the source inode's i_size".
+ *
+ * Returns -EBADF if either file has no private_data, -EOPNOTSUPP if the
+ * server ops provide no duplicate_extents handler, and otherwise whatever
+ * that handler returns.
+ */
+static int cifs_clone_file_range(struct file *src_file, loff_t off,
+		struct file *dst_file, loff_t destoff, u64 len)
+{
+	struct inode *src_inode = file_inode(src_file);
+	struct inode *target_inode = file_inode(dst_file);
+	struct cifsFileInfo *smb_file_src = src_file->private_data;
+	struct cifsFileInfo *smb_file_target = dst_file->private_data;
+	struct cifs_tcon *target_tcon;
+	unsigned int xid;
+	int rc;
+
+	cifs_dbg(FYI, "clone range\n");
+
+	xid = get_xid();
+
+	if (!src_file->private_data || !dst_file->private_data) {
+		rc = -EBADF;
+		cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+		goto out;
+	}
+
+	/*
+	 * Only look at the target's tlink now that its cifsFileInfo is known
+	 * to be non-NULL; doing this in the declarations would dereference
+	 * the pointer before the check above.
+	 */
+	target_tcon = tlink_tcon(smb_file_target->tlink);
+
+	/*
+	 * Note: cifs case is easier than btrfs since server responsible for
+	 * checks for proper open modes and file type and if it wants
+	 * server could even support copy of range where source = target
+	 */
+	lock_two_nondirectories(target_inode, src_inode);
+
+	/* a zero length means "everything from @off to the source's i_size" */
+	if (len == 0)
+		len = src_inode->i_size - off;
+
+	cifs_dbg(FYI, "about to flush pages\n");
+	/* should we flush first and last page first */
+	truncate_inode_pages_range(&target_inode->i_data, destoff,
+				   PAGE_ALIGN(destoff + len)-1);
+
+	if (target_tcon->ses->server->ops->duplicate_extents)
+		rc = target_tcon->ses->server->ops->duplicate_extents(xid,
+			smb_file_src, smb_file_target, off, len, destoff);
+	else
+		rc = -EOPNOTSUPP;
+
+	/* force revalidate of size and timestamps of target file now
+	   that target is updated on the server */
+	CIFS_I(target_inode)->time = 0;
+	/* although unlocking in the reverse order from locking is not
+	   strictly necessary here it is a little cleaner to be consistent */
+	unlock_two_nondirectories(src_inode, target_inode);
+out:
+	free_xid(xid);
+	return rc;
+}
+
const struct file_operations cifs_file_ops = {
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
@@ -921,9 +992,8 @@ const struct file_operations cifs_file_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -939,9 +1009,8 @@ const struct file_operations cifs_file_strict_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -957,9 +1026,8 @@ const struct file_operations cifs_file_direct_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -975,9 +1043,8 @@ const struct file_operations cifs_file_nobrl_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -992,9 +1059,8 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1009,9 +1075,8 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
+ .clone_file_range = cifs_clone_file_range,
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -1022,6 +1087,7 @@ const struct file_operations cifs_dir_ops = {
.release = cifs_closedir,
.read = generic_read_dir,
.unlocked_ioctl = cifs_ioctl,
+ .clone_file_range = cifs_clone_file_range,
.llseek = generic_file_llseek,
};
@@ -1040,7 +1106,7 @@ cifs_init_inodecache(void)
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
sizeof(struct cifsInodeInfo),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
cifs_init_once);
if (cifs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c3cc1609025f..83aac8ba50b0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -31,19 +31,15 @@
* so that it will fit. We use hash_64 to convert the value to 31 bits, and
* then add 1, to ensure that we don't end up with a 0 as the value.
*/
-#if BITS_PER_LONG == 64
static inline ino_t
cifs_uniqueid_to_ino_t(u64 fileid)
{
+ if ((sizeof(ino_t)) < (sizeof(u64)))
+ return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+
return (ino_t)fileid;
+
}
-#else
-static inline ino_t
-cifs_uniqueid_to_ino_t(u64 fileid)
-{
- return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
-}
-#endif
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
@@ -120,9 +116,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
#endif
/* Functions related to symlinks */
-extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
-extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
- int buflen);
+extern const char *cifs_get_link(struct dentry *, struct inode *,
+ struct delayed_call *);
extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
const char *symname);
extern int cifs_removexattr(struct dentry *, const char *);
@@ -131,7 +126,6 @@ extern int cifs_setxattr(struct dentry *, const char *, const void *,
extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-
#ifdef CONFIG_CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b406a32deb1f..f2cc0b3d1af7 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,8 +70,10 @@
#define SERVER_NAME_LENGTH 40
#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
-/* SMB echo "timeout" -- FIXME: tunable? */
-#define SMB_ECHO_INTERVAL (60 * HZ)
+/* echo interval in seconds */
+#define SMB_ECHO_INTERVAL_MIN 1
+#define SMB_ECHO_INTERVAL_MAX 600
+#define SMB_ECHO_INTERVAL_DEFAULT 60
#include "cifspdu.h"
@@ -225,7 +227,7 @@ struct smb_version_operations {
void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
/* verify the message */
- int (*check_message)(char *, unsigned int);
+ int (*check_message)(char *, unsigned int, struct TCP_Server_Info *);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
void (*downgrade_oplock)(struct TCP_Server_Info *,
struct cifsInodeInfo *, bool);
@@ -493,7 +495,10 @@ struct smb_vol {
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
bool rwpidforward:1; /* pid forward for read/write operations */
- bool nosharesock;
+ bool nosharesock:1;
+ bool persistent:1;
+ bool nopersistent:1;
+ bool resilient:1; /* noresilient not required since not forced for CA */
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -504,6 +509,7 @@ struct smb_vol {
struct sockaddr_storage dstaddr; /* destination address */
struct sockaddr_storage srcaddr; /* allow binding to a local IP */
struct nls_table *local_nls;
+ unsigned int echo_interval; /* echo interval in secs */
};
#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
@@ -624,7 +630,9 @@ struct TCP_Server_Info {
#ifdef CONFIG_CIFS_SMB2
unsigned int max_read;
unsigned int max_write;
+ __u8 preauth_hash[512];
#endif /* CONFIG_CIFS_SMB2 */
+ unsigned long echo_interval;
};
static inline unsigned int
@@ -706,7 +714,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
*
* Note that this might make for "interesting" allocation problems during
* writeback however as we have to allocate an array of pointers for the
- * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ * pages. A 16M write means ~32kb page array with PAGE_SIZE == 4096.
*
* For reads, there is a similar problem as we need to allocate an array
* of kvecs to handle the receive, though that should only need to be done
@@ -725,7 +733,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb)
/*
* The default wsize is 1M. find_get_pages seems to return a maximum of 256
- * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
+ * pages in a single call. With PAGE_SIZE == 4k, this means we can fill
* a single wsize request with a single call.
*/
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
@@ -806,7 +814,10 @@ struct cifs_ses {
bool need_reconnect:1; /* connection reset, uid now invalid */
#ifdef CONFIG_CIFS_SMB2
__u16 session_flags;
- char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
+ __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
+ __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
+ __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+ __u8 preauth_hash[512];
#endif /* CONFIG_CIFS_SMB2 */
};
@@ -895,6 +906,8 @@ struct cifs_tcon {
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool broken_sparse_sup; /* if server or share does not support sparse */
bool need_reconnect:1; /* connection reset, tid now invalid */
+ bool use_resilient:1; /* use resilient instead of durable handles */
+ bool use_persistent:1; /* use persistent instead of durable handles */
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
@@ -1015,6 +1028,7 @@ struct cifs_fid {
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
+ __u8 create_guid[16];
#endif
struct cifs_pending_open *pending_open;
unsigned int epoch;
@@ -1582,11 +1596,11 @@ GLOBAL_EXTERN atomic_t midCount;
/* Misc globals */
GLOBAL_EXTERN bool enable_oplocks; /* enable or disable oplocks */
-GLOBAL_EXTERN unsigned int lookupCacheEnabled;
+GLOBAL_EXTERN bool lookupCacheEnabled;
GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
-GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
+GLOBAL_EXTERN bool linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index c63fd1dde25b..eed7ff50faf0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *out_buf,
int *bytes_returned);
extern int cifs_reconnect(struct TCP_Server_Info *server);
-extern int checkSMB(char *buf, unsigned int length);
+extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr);
extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
@@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
extern int calc_seckey(struct cifs_ses *);
-extern int generate_smb3signingkey(struct cifs_ses *);
+extern int generate_smb30signingkey(struct cifs_ses *);
+extern int generate_smb311signingkey(struct cifs_ses *);
#ifdef CONFIG_CIFS_WEAK_PW_HASH
extern int calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 90b4f9f7de66..a894bf809ff7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1396,11 +1396,10 @@ openRetry:
* current bigbuf.
*/
static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+discard_remaining_data(struct TCP_Server_Info *server)
{
unsigned int rfclen = get_rfc1002_length(server->smallbuf);
int remaining = rfclen + 4 - server->total_read;
- struct cifs_readdata *rdata = mid->callback_data;
while (remaining > 0) {
int length;
@@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
remaining -= length;
}
- dequeue_mid(mid, rdata->result);
return 0;
}
+/*
+ * Discard what is left of the current response via discard_remaining_data(),
+ * then pass @mid to dequeue_mid() together with the result already recorded
+ * in its readdata. The value returned is the one discard_remaining_data()
+ * handed back.
+ */
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	struct cifs_readdata *rdata = mid->callback_data;
+	int rc = discard_remaining_data(server);
+
+	dequeue_mid(mid, rdata->result);
+	return rc;
+}
+
int
cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
{
@@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
return length;
server->total_read += length;
+ if (server->ops->is_status_pending &&
+ server->ops->is_status_pending(buf, server, 0)) {
+ discard_remaining_data(server);
+ return -1;
+ }
+
/* Was the SMB read successful? */
rdata->result = server->ops->map_error(buf, false);
if (rdata->result != 0) {
@@ -1914,17 +1929,17 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wsize = server->ops->wp_retry_size(inode);
if (wsize < rest_len) {
- nr_pages = wsize / PAGE_CACHE_SIZE;
+ nr_pages = wsize / PAGE_SIZE;
if (!nr_pages) {
rc = -ENOTSUPP;
break;
}
- cur_len = nr_pages * PAGE_CACHE_SIZE;
- tailsz = PAGE_CACHE_SIZE;
+ cur_len = nr_pages * PAGE_SIZE;
+ tailsz = PAGE_SIZE;
} else {
- nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
+ nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
cur_len = rest_len;
- tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
+ tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
}
wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
@@ -1942,7 +1957,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->sync_mode = wdata->sync_mode;
wdata2->nr_pages = nr_pages;
wdata2->offset = page_offset(wdata2->pages[0]);
- wdata2->pagesz = PAGE_CACHE_SIZE;
+ wdata2->pagesz = PAGE_SIZE;
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
@@ -1960,7 +1975,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
if (rc != 0 && rc != -EAGAIN) {
SetPageError(wdata2->pages[j]);
end_page_writeback(wdata2->pages[j]);
- page_cache_release(wdata2->pages[j]);
+ put_page(wdata2->pages[j]);
}
}
@@ -2003,7 +2018,7 @@ cifs_writev_complete(struct work_struct *work)
else if (wdata->result < 0)
SetPageError(page);
end_page_writeback(page);
- page_cache_release(page);
+ put_page(page);
}
if (wdata->result != -EAGAIN)
mapping_set_error(inode->i_mapping, wdata->result);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3f2228570d44..6f62ac821a84 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,12 +87,15 @@ enum {
Opt_sign, Opt_seal, Opt_noac,
Opt_fsc, Opt_mfsymlinks,
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
+ Opt_persistent, Opt_nopersistent,
+ Opt_resilient, Opt_noresilient,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
Opt_cruid, Opt_gid, Opt_file_mode,
Opt_dirmode, Opt_port,
Opt_rsize, Opt_wsize, Opt_actimeo,
+ Opt_echo_interval,
/* Mount options which take string value */
Opt_user, Opt_pass, Opt_ip,
@@ -169,6 +172,10 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_multiuser, "multiuser" },
{ Opt_sloppy, "sloppy" },
{ Opt_nosharesock, "nosharesock" },
+ { Opt_persistent, "persistenthandles"},
+ { Opt_nopersistent, "nopersistenthandles"},
+ { Opt_resilient, "resilienthandles"},
+ { Opt_noresilient, "noresilienthandles"},
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -182,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_rsize, "rsize=%s" },
{ Opt_wsize, "wsize=%s" },
{ Opt_actimeo, "actimeo=%s" },
+ { Opt_echo_interval, "echo_interval=%s" },
{ Opt_blank_user, "user=" },
{ Opt_blank_user, "username=" },
@@ -362,7 +370,6 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->session_key.response = NULL;
server->session_key.len = 0;
server->lstrp = jiffies;
- mutex_unlock(&server->srv_mutex);
/* mark submitted MIDs for retry and issue callback */
INIT_LIST_HEAD(&retry_list);
@@ -375,6 +382,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
list_move(&mid_entry->qhead, &retry_list);
}
spin_unlock(&GlobalMid_Lock);
+ mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_safe(tmp, tmp2, &retry_list) {
@@ -412,6 +420,7 @@ cifs_echo_request(struct work_struct *work)
int rc;
struct TCP_Server_Info *server = container_of(work,
struct TCP_Server_Info, echo.work);
+ unsigned long echo_interval = server->echo_interval;
/*
* We cannot send an echo if it is disabled or until the
@@ -421,7 +430,7 @@ cifs_echo_request(struct work_struct *work)
*/
if (!server->ops->need_neg || server->ops->need_neg(server) ||
(server->ops->can_echo && !server->ops->can_echo(server)) ||
- time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+ time_before(jiffies, server->lstrp + echo_interval - HZ))
goto requeue_echo;
rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
@@ -430,7 +439,7 @@ cifs_echo_request(struct work_struct *work)
server->hostname);
requeue_echo:
- queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
+ queue_delayed_work(cifsiod_wq, &server->echo, echo_interval);
}
static bool
@@ -481,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server)
* a response in >60s.
*/
if (server->tcpStatus == CifsGood &&
- time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
- cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n",
- server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ);
+ time_after(jiffies, server->lstrp + 2 * server->echo_interval)) {
+ cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n",
+ server->hostname, (2 * server->echo_interval) / HZ);
cifs_reconnect(server);
wake_up(&server->response_q);
return true;
@@ -822,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
* 48 bytes is enough to display the header and a little bit
* into the payload for debugging purposes.
*/
- length = server->ops->check_message(buf, server->total_read);
+ length = server->ops->check_message(buf, server->total_read, server);
if (length != 0)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
@@ -1497,6 +1506,33 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_nosharesock:
vol->nosharesock = true;
break;
+ case Opt_nopersistent:
+ vol->nopersistent = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_persistent:
+ vol->persistent = true;
+ if ((vol->nopersistent) || (vol->resilient)) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_resilient:
+ vol->resilient = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_noresilient:
+ vol->resilient = false; /* already the default */
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -1591,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
break;
+ case Opt_echo_interval:
+ if (get_option_ul(args, &option)) {
+ cifs_dbg(VFS, "%s: Invalid echo interval value\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ vol->echo_interval = option;
+ break;
/* String Arguments */
@@ -2056,6 +2100,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
if (!match_security(server, vol))
return 0;
+ if (server->echo_interval != vol->echo_interval)
+ return 0;
+
return 1;
}
@@ -2175,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
tcp_ses->tcpStatus = CifsNew;
++tcp_ses->srv_count;
+ if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
+ volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX)
+ tcp_ses->echo_interval = volume_info->echo_interval * HZ;
+ else
+ tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
+
rc = ip_connect(tcp_ses);
if (rc < 0) {
cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
@@ -2204,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
cifs_fscache_get_client_cookie(tcp_ses);
/* queue echo request delayed work */
- queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval);
return tcp_ses;
@@ -2655,6 +2708,42 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
}
tcon->seal = volume_info->seal;
+ tcon->use_persistent = false;
+ /* check if SMB2 or later, CIFS does not support persistent handles */
+ if (volume_info->persistent) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB3 or later required for persistent handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#ifdef CONFIG_CIFS_SMB2
+ } else if (ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ tcon->use_persistent = true;
+ else /* persistent handles requested but not supported */ {
+ cifs_dbg(VFS,
+ "Persistent handles not supported on share\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#endif /* CONFIG_CIFS_SMB2 */
+ }
+#ifdef CONFIG_CIFS_SMB2
+ } else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+ && (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ && (volume_info->nopersistent == false)) {
+ cifs_dbg(FYI, "enabling persistent handles\n");
+ tcon->use_persistent = true;
+#endif /* CONFIG_CIFS_SMB2 */
+ } else if (volume_info->resilient) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB2.1 or later required for resilient handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ }
+ tcon->use_resilient = true;
+ }
+
/*
* We can have only one retry value for a connection to a share so for
* resources mounted more than once to the same server share the last
@@ -2910,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
if (ses_init_buf) {
ses_init_buf->trailer.session_req.called_len = 32;
- if (server->server_RFC1001_name &&
- server->server_RFC1001_name[0] != 0)
+ if (server->server_RFC1001_name[0] != 0)
rfc1002mangle(ses_init_buf->trailer.
session_req.called_name,
server->server_RFC1001_name,
@@ -3503,6 +3591,15 @@ try_mount_again:
goto mount_fail_check;
}
+#ifdef CONFIG_CIFS_SMB2
+ if ((volume_info->persistent == true) && ((ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
+ cifs_dbg(VFS, "persistent handles not supported by server\n");
+ rc = -EOPNOTSUPP;
+ goto mount_fail_check;
+ }
+#endif /* CONFIG_CIFS_SMB2*/
+
/* search for existing tcon to this server share */
tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon)) {
@@ -3533,7 +3630,7 @@ try_mount_again:
cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
/* tune readahead according to rsize */
- cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
+ cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 47c5c97e2dd3..c03d0744648b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1833,7 +1833,7 @@ refind_writable:
static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
{
struct address_space *mapping = page->mapping;
- loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ loff_t offset = (loff_t)page->index << PAGE_SHIFT;
char *write_data;
int rc = -EFAULT;
int bytes_written = 0;
@@ -1849,7 +1849,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data = kmap(page);
write_data += from;
- if ((to > PAGE_CACHE_SIZE) || (from > to)) {
+ if ((to > PAGE_SIZE) || (from > to)) {
kunmap(page);
return -EIO;
}
@@ -1902,7 +1902,7 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
* find_get_pages_tag seems to return a max of 256 on each
* iteration, so we must call it several times in order to
* fill the array or the wsize is effectively limited to
- * 256 * PAGE_CACHE_SIZE.
+ * 256 * PAGE_SIZE.
*/
*found_pages = 0;
pages = wdata->pages;
@@ -1991,7 +1991,7 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
/* put any pages we aren't going to use */
for (i = nr_pages; i < found_pages; i++) {
- page_cache_release(wdata->pages[i]);
+ put_page(wdata->pages[i]);
wdata->pages[i] = NULL;
}
@@ -2009,11 +2009,11 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
wdata->sync_mode = wbc->sync_mode;
wdata->nr_pages = nr_pages;
wdata->offset = page_offset(wdata->pages[0]);
- wdata->pagesz = PAGE_CACHE_SIZE;
+ wdata->pagesz = PAGE_SIZE;
wdata->tailsz = min(i_size_read(mapping->host) -
page_offset(wdata->pages[nr_pages - 1]),
- (loff_t)PAGE_CACHE_SIZE);
- wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
+ (loff_t)PAGE_SIZE);
+ wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
if (wdata->cfile != NULL)
cifsFileInfo_put(wdata->cfile);
@@ -2047,15 +2047,15 @@ static int cifs_writepages(struct address_space *mapping,
* If wsize is smaller than the page cache size, default to writing
* one page at a time via cifs_writepage
*/
- if (cifs_sb->wsize < PAGE_CACHE_SIZE)
+ if (cifs_sb->wsize < PAGE_SIZE)
return generic_writepages(mapping, wbc);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = true;
scanned = true;
@@ -2071,7 +2071,7 @@ retry:
if (rc)
break;
- tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
+ tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
&found_pages);
@@ -2111,7 +2111,7 @@ retry:
else
SetPageError(wdata->pages[i]);
end_page_writeback(wdata->pages[i]);
- page_cache_release(wdata->pages[i]);
+ put_page(wdata->pages[i]);
}
if (rc != -EAGAIN)
mapping_set_error(mapping, rc);
@@ -2154,7 +2154,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
xid = get_xid();
/* BB add check for wbc flags */
- page_cache_get(page);
+ get_page(page);
if (!PageUptodate(page))
cifs_dbg(FYI, "ppw - page not up to date\n");
@@ -2170,7 +2170,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
*/
set_page_writeback(page);
retry_write:
- rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
+ rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
goto retry_write;
else if (rc == -EAGAIN)
@@ -2180,7 +2180,7 @@ retry_write:
else
SetPageUptodate(page);
end_page_writeback(page);
- page_cache_release(page);
+ put_page(page);
free_xid(xid);
return rc;
}
@@ -2214,12 +2214,12 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
if (copied == len)
SetPageUptodate(page);
ClearPageChecked(page);
- } else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+ } else if (!PageUptodate(page) && copied == PAGE_SIZE)
SetPageUptodate(page);
if (!PageUptodate(page)) {
char *page_data;
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = pos & (PAGE_SIZE - 1);
unsigned int xid;
xid = get_xid();
@@ -2248,7 +2248,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return rc;
}
@@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (rc)
return rc;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
xid = get_xid();
@@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
}
free_xid(xid);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc;
}
@@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (rc)
return rc;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
xid = get_xid();
@@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
free_xid(xid);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc;
}
@@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
* with a brlock that prevents writing.
*/
down_read(&cinode->lock_sem);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
rc = generic_write_checks(iocb, from);
if (rc <= 0)
@@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
else
rc = -EACCES;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (rc > 0) {
ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
@@ -3286,9 +3286,9 @@ cifs_readv_complete(struct work_struct *work)
(rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
- got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
+ got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);
- page_cache_release(page);
+ put_page(page);
rdata->pages[i] = NULL;
}
kref_put(&rdata->refcount, cifs_readdata_release);
@@ -3307,21 +3307,21 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
/* determine the eof that the server (probably) has */
eof = CIFS_I(rdata->mapping->host)->server_eof;
- eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+ eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0;
cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
rdata->got_bytes = 0;
- rdata->tailsz = PAGE_CACHE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
struct page *page = rdata->pages[i];
- if (len >= PAGE_CACHE_SIZE) {
+ if (len >= PAGE_SIZE) {
/* enough data to fill the page */
iov.iov_base = kmap(page);
- iov.iov_len = PAGE_CACHE_SIZE;
+ iov.iov_len = PAGE_SIZE;
cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
i, page->index, iov.iov_base, iov.iov_len);
- len -= PAGE_CACHE_SIZE;
+ len -= PAGE_SIZE;
} else if (len > 0) {
/* enough for partial page, fill and zero the rest */
iov.iov_base = kmap(page);
@@ -3329,7 +3329,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
i, page->index, iov.iov_base, iov.iov_len);
memset(iov.iov_base + len,
- '\0', PAGE_CACHE_SIZE - len);
+ '\0', PAGE_SIZE - len);
rdata->tailsz = len;
len = 0;
} else if (page->index > eof_index) {
@@ -3341,12 +3341,12 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
* to prevent the VFS from repeatedly attempting to
* fill them until the writes are flushed.
*/
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
lru_cache_add_file(page);
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
rdata->pages[i] = NULL;
rdata->nr_pages--;
continue;
@@ -3354,7 +3354,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
/* no need to hold page hostage */
lru_cache_add_file(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
rdata->pages[i] = NULL;
rdata->nr_pages--;
continue;
@@ -3380,7 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
struct page *page, *tpage;
unsigned int expected_index;
int rc;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
INIT_LIST_HEAD(tmplist);
@@ -3391,19 +3391,19 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
* should have access to this page, we're safe to simply set
* PG_locked without checking it first.
*/
- __set_page_locked(page);
+ __SetPageLocked(page);
rc = add_to_page_cache_locked(page, mapping,
page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
return rc;
}
/* move first page to the tmplist */
- *offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
- *bytes = PAGE_CACHE_SIZE;
+ *offset = (loff_t)page->index << PAGE_SHIFT;
+ *bytes = PAGE_SIZE;
*nr_pages = 1;
list_move_tail(&page->lru, tmplist);
@@ -3415,16 +3415,16 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
break;
/* would this page push the read over the rsize? */
- if (*bytes + PAGE_CACHE_SIZE > rsize)
+ if (*bytes + PAGE_SIZE > rsize)
break;
- __set_page_locked(page);
+ __SetPageLocked(page);
if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
- __clear_page_locked(page);
+ __ClearPageLocked(page);
break;
}
list_move_tail(&page->lru, tmplist);
- (*bytes) += PAGE_CACHE_SIZE;
+ (*bytes) += PAGE_SIZE;
expected_index++;
(*nr_pages)++;
}
@@ -3493,7 +3493,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* reach this point however since we set ra_pages to 0 when the
* rsize is smaller than a cache page.
*/
- if (unlikely(rsize < PAGE_CACHE_SIZE)) {
+ if (unlikely(rsize < PAGE_SIZE)) {
add_credits_and_wake_if(server, credits, 0);
return 0;
}
@@ -3512,7 +3512,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
list_del(&page->lru);
lru_cache_add_file(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
rc = -ENOMEM;
add_credits_and_wake_if(server, credits, 0);
@@ -3524,7 +3524,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata->offset = offset;
rdata->bytes = bytes;
rdata->pid = pid;
- rdata->pagesz = PAGE_CACHE_SIZE;
+ rdata->pagesz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->credits = credits;
@@ -3542,7 +3542,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
page = rdata->pages[i];
lru_cache_add_file(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
/* Fallback to the readpage in error/reconnect cases */
kref_put(&rdata->refcount, cifs_readdata_release);
@@ -3577,7 +3577,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
read_data = kmap(page);
/* for reads over a certain size could initiate async read ahead */
- rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset);
+ rc = cifs_read(file, read_data, PAGE_SIZE, poffset);
if (rc < 0)
goto io_error;
@@ -3587,8 +3587,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
file_inode(file)->i_atime =
current_fs_time(file_inode(file)->i_sb);
- if (PAGE_CACHE_SIZE > rc)
- memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc);
+ if (PAGE_SIZE > rc)
+ memset(read_data + rc, 0, PAGE_SIZE - rc);
flush_dcache_page(page);
SetPageUptodate(page);
@@ -3608,7 +3608,7 @@ read_complete:
static int cifs_readpage(struct file *file, struct page *page)
{
- loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ loff_t offset = (loff_t)page->index << PAGE_SHIFT;
int rc = -EACCES;
unsigned int xid;
@@ -3679,8 +3679,8 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
int oncethru = 0;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = pos >> PAGE_SHIFT;
+ loff_t offset = pos & (PAGE_SIZE - 1);
loff_t page_start = pos & PAGE_MASK;
loff_t i_size;
struct page *page;
@@ -3703,7 +3703,7 @@ start:
* the server. If the write is short, we'll end up doing a sync write
* instead.
*/
- if (len == PAGE_CACHE_SIZE)
+ if (len == PAGE_SIZE)
goto out;
/*
@@ -3718,7 +3718,7 @@ start:
(offset == 0 && (pos + len) >= i_size)) {
zero_user_segments(page, 0, offset,
offset + len,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
/*
* PageChecked means that the parts of the page
* to which we're not writing are considered up
@@ -3737,7 +3737,7 @@ start:
* do a sync write instead since PG_uptodate isn't set.
*/
cifs_readpage_worker(file, page, &page_start);
- page_cache_release(page);
+ put_page(page);
oncethru = 1;
goto start;
} else {
@@ -3764,7 +3764,7 @@ static void cifs_invalidate_page(struct page *page, unsigned int offset,
{
struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
- if (offset == 0 && length == PAGE_CACHE_SIZE)
+ if (offset == 0 && length == PAGE_SIZE)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
@@ -3772,7 +3772,7 @@ static int cifs_launder_page(struct page *page)
{
int rc = 0;
loff_t range_start = page_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+ loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6b66dd5d1540..5f9ad5c42180 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -59,7 +59,7 @@ static void cifs_set_ops(struct inode *inode)
/* check if server can support readpages */
if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
- PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
+ PAGE_SIZE + MAX_CIFS_HDR_SIZE)
inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
else
inode->i_data.a_ops = &cifs_addr_ops;
@@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
}
} else
fattr.cf_uniqueid = iunique(sb, ROOT_I);
- } else
- fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ } else {
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+ validinum == false && server->ops->get_srv_inum) {
+ /*
+ * Pass a NULL tcon to ensure we don't make a round
+ * trip to the server. This only works for SMB2+.
+ */
+ tmprc = server->ops->get_srv_inum(xid,
+ NULL, cifs_sb, full_path,
+ &fattr.cf_uniqueid, data);
+ if (tmprc)
+ fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ } else
+ fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+ }
/* query for SFU type info if supported and needed */
if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
@@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
} else {
/* we already have inode, update it */
+ /* if uniqueid is different, return error */
+ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
+ CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+ rc = -ESTALE;
+ goto cgii_exit;
+ }
+
/* if filetype is different, return error */
if (unlikely(((*inode)->i_mode & S_IFMT) !=
(fattr.cf_mode & S_IFMT))) {
@@ -1831,11 +1851,11 @@ cifs_invalidate_mapping(struct inode *inode)
* @word: long word containing the bit lock
*/
static int
-cifs_wait_bit_killable(struct wait_bit_key *key)
+cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
@@ -1999,8 +2019,8 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
static int cifs_truncate_page(struct address_space *mapping, loff_t from)
{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE - 1);
struct page *page;
int rc = 0;
@@ -2008,9 +2028,9 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
if (!page)
return -ENOMEM;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return rc;
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 28a77bf1d559..7a3b84e300f8 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -34,68 +34,36 @@
#include "cifs_ioctl.h"
#include <linux/btrfs.h>
-static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
- unsigned long srcfd, u64 off, u64 len, u64 destoff,
- bool dup_extents)
+static int cifs_file_clone_range(unsigned int xid, struct file *src_file,
+ struct file *dst_file)
{
- int rc;
- struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
- struct cifs_tcon *target_tcon;
- struct fd src_file;
struct cifsFileInfo *smb_file_src;
- struct inode *src_inode;
+ struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
+ struct cifs_tcon *target_tcon;
+ int rc;
cifs_dbg(FYI, "ioctl clone range\n");
- /* the destination must be opened for writing */
- if (!(dst_file->f_mode & FMODE_WRITE)) {
- cifs_dbg(FYI, "file target not open for write\n");
- return -EINVAL;
- }
-
- /* check if target volume is readonly and take reference */
- rc = mnt_want_write_file(dst_file);
- if (rc) {
- cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
- return rc;
- }
- src_file = fdget(srcfd);
- if (!src_file.file) {
- rc = -EBADF;
- goto out_drop_write;
- }
-
- if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
- rc = -EBADF;
- cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
- goto out_fput;
- }
-
- if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+ if (!src_file->private_data || !dst_file->private_data) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
- goto out_fput;
+ goto out;
}
rc = -EXDEV;
smb_file_target = dst_file->private_data;
- smb_file_src = src_file.file->private_data;
+ smb_file_src = src_file->private_data;
src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
- /* check if source and target are on same tree connection */
- if (src_tcon != target_tcon) {
- cifs_dbg(VFS, "file copy src and target on different volume\n");
- goto out_fput;
+ if (src_tcon->ses != target_tcon->ses) {
+ cifs_dbg(VFS, "source and target of copy not on same server\n");
+ goto out;
}
- src_inode = file_inode(src_file.file);
- rc = -EINVAL;
- if (S_ISDIR(src_inode->i_mode))
- goto out_fput;
-
/*
* Note: cifs case is easier than btrfs since server responsible for
* checks for proper open modes and file type and if it wants
@@ -103,34 +71,66 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
*/
lock_two_nondirectories(target_inode, src_inode);
- /* determine range to clone */
- rc = -EINVAL;
- if (off + len > src_inode->i_size || off + len < off)
- goto out_unlock;
- if (len == 0)
- len = src_inode->i_size - off;
-
cifs_dbg(FYI, "about to flush pages\n");
/* should we flush first and last page first */
- truncate_inode_pages_range(&target_inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len)-1);
+ truncate_inode_pages(&target_inode->i_data, 0);
- if (dup_extents && target_tcon->ses->server->ops->duplicate_extents)
- rc = target_tcon->ses->server->ops->duplicate_extents(xid,
- smb_file_src, smb_file_target, off, len, destoff);
- else if (!dup_extents && target_tcon->ses->server->ops->clone_range)
+ if (target_tcon->ses->server->ops->clone_range)
rc = target_tcon->ses->server->ops->clone_range(xid,
- smb_file_src, smb_file_target, off, len, destoff);
+ smb_file_src, smb_file_target, 0, src_inode->i_size, 0);
else
rc = -EOPNOTSUPP;
/* force revalidate of size and timestamps of target file now
that target is updated on the server */
CIFS_I(target_inode)->time = 0;
-out_unlock:
/* although unlocking in the reverse order from locking is not
strictly necessary here it is a little cleaner to be consistent */
unlock_two_nondirectories(src_inode, target_inode);
+out:
+ return rc;
+}
+
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+ unsigned long srcfd)
+{
+ int rc;
+ struct fd src_file;
+ struct inode *src_inode;
+
+ cifs_dbg(FYI, "ioctl clone range\n");
+ /* the destination must be opened for writing */
+ if (!(dst_file->f_mode & FMODE_WRITE)) {
+ cifs_dbg(FYI, "file target not open for write\n");
+ return -EINVAL;
+ }
+
+ /* check if target volume is readonly and take reference */
+ rc = mnt_want_write_file(dst_file);
+ if (rc) {
+ cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+ return rc;
+ }
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ rc = -EBADF;
+ goto out_drop_write;
+ }
+
+ if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
+ goto out_fput;
+ }
+
+ src_inode = file_inode(src_file.file);
+ rc = -EINVAL;
+ if (S_ISDIR(src_inode->i_mode))
+ goto out_fput;
+
+ rc = cifs_file_clone_range(xid, src_file.file, dst_file);
+
out_fput:
fdput(src_file);
out_drop_write:
@@ -251,10 +251,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
}
break;
case CIFS_IOC_COPYCHUNK_FILE:
- rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false);
- break;
- case BTRFS_IOC_CLONE:
- rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true);
+ rc = cifs_ioctl_clone(xid, filep, arg);
break;
case CIFS_IOC_SET_INTEGRITY:
if (pSMBFile == NULL)
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e3548f73bdea..062c2375549a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -627,9 +627,9 @@ cifs_hl_exit:
}
const char *
-cifs_follow_link(struct dentry *direntry, void **cookie)
+cifs_get_link(struct dentry *direntry, struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(direntry);
int rc = -ENOMEM;
unsigned int xid;
char *full_path = NULL;
@@ -639,6 +639,9 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
+ if (!direntry)
+ return ERR_PTR(-ECHILD);
+
xid = get_xid();
tlink = cifs_sb_tlink(cifs_sb);
@@ -678,7 +681,8 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
kfree(target_path);
return ERR_PTR(rc);
}
- return *cookie = target_path;
+ set_delayed_call(done, kfree_link, target_path);
+ return target_path;
}
int
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 8442b8b8e0be..813fe13c2ae1 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb)
}
int
-checkSMB(char *buf, unsigned int total_read)
+checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
{
struct smb_hdr *smb = (struct smb_hdr *)buf;
__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 0557c45e9c33..b30a4a6d98a0 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -847,6 +847,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
* if buggy server returns . and .. late do we want to
* check for that here?
*/
+ *tmp_buf = 0;
rc = cifs_filldir(current_entry, file, ctx,
tmp_buf, max_len);
if (rc) {
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2ab297dae5a7..f9e766f464be 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -43,6 +43,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock[17];
struct cifs_fid *fid = oparms->fid;
+ struct network_resiliency_req nr_ioctl_req;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL) {
@@ -67,6 +68,24 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
if (rc)
goto out;
+
+ if (oparms->tcon->use_resilient) {
+ nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+ nr_ioctl_req.Reserved = 0;
+ rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
+ fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true,
+ (char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
+ NULL, NULL /* no return info */);
+ if (rc == -EOPNOTSUPP) {
+ cifs_dbg(VFS,
+ "resiliency not supported by server, disabling\n");
+ oparms->tcon->use_resilient = false;
+ } else if (rc)
+ cifs_dbg(FYI, "error %d setting resiliency\n", rc);
+
+ rc = 0;
+ }
+
if (buf) {
/* open response does not have IndexNumber field - get it */
rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1c5907019045..389fb9f8c84e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
* Make sure that this really is an SMB, that it is a response,
* and that the message ids match.
*/
- if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
+ if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
(mid == wire_mid)) {
if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
return 0;
@@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
cifs_dbg(VFS, "Received Request not response\n");
}
} else { /* bad signature or mid */
- if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
+ if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
cifs_dbg(VFS, "Bad protocol string signature header %x\n",
- *(unsigned int *) hdr->ProtocolId);
+ le32_to_cpu(hdr->ProtocolId));
if (mid != wire_mid)
cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
mid, wire_mid);
@@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
};
int
-smb2_check_message(char *buf, unsigned int length)
+smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
{
struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
- __u64 mid = le64_to_cpu(hdr->MessageId);
+ __u64 mid;
__u32 len = get_rfc1002_length(buf);
__u32 clc_len; /* calculated length */
int command;
@@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length)
* ie Validate the wct via smb2_struct_sizes table above
*/
+ if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+ struct smb2_transform_hdr *thdr =
+ (struct smb2_transform_hdr *)buf;
+ struct cifs_ses *ses = NULL;
+ struct list_head *tmp;
+
+ /* decrypt frame now that it is completely read in */
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(tmp, &srvr->smb_ses_list) {
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ if (ses->Suid == thdr->SessionId)
+ break;
+
+ ses = NULL;
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ if (ses == NULL) {
+ cifs_dbg(VFS, "no decryption - session id not found\n");
+ return 1;
+ }
+ }
+
+
+ mid = le64_to_cpu(hdr->MessageId);
if (length < sizeof(struct smb2_pdu)) {
if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
pdu->StructureSize2 = 0;
@@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
/* return pointer to beginning of data area, ie offset from SMB start */
if ((*off != 0) && (*len != 0))
- return (char *)(&hdr->ProtocolId[0]) + *off;
+ return (char *)(&hdr->ProtocolId) + *off;
else
return NULL;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 18da19f4f811..3525ed756173 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
__u64 wire_mid = le64_to_cpu(hdr->MessageId);
+ if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+ cifs_dbg(VFS, "encrypted frame parsing not supported yet");
+ return NULL;
+ }
+
spin_lock(&GlobalMid_Lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if ((mid->mid == wire_mid) &&
@@ -810,7 +815,6 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
cfile->fid.volatile_fid, cfile->pid, &eof, false);
}
-#ifdef CONFIG_CIFS_SMB311
static int
smb2_duplicate_extents(const unsigned int xid,
struct cifsFileInfo *srcfile,
@@ -854,8 +858,6 @@ smb2_duplicate_extents(const unsigned int xid,
duplicate_extents_out:
return rc;
}
-#endif /* CONFIG_CIFS_SMB311 */
-
static int
smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1695,7 +1697,7 @@ struct smb_version_operations smb30_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
- .generate_signingkey = generate_smb3signingkey,
+ .generate_signingkey = generate_smb30signingkey,
.calc_signature = smb3_calc_signature,
.set_integrity = smb3_set_integrity,
.is_read_op = smb21_is_read_op,
@@ -1703,6 +1705,7 @@ struct smb_version_operations smb30_operations = {
.create_lease_buf = smb3_create_lease_buf,
.parse_lease_buf = smb3_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .duplicate_extents = smb2_duplicate_extents,
.validate_negotiate = smb3_validate_negotiate,
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
@@ -1781,7 +1784,7 @@ struct smb_version_operations smb311_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
- .generate_signingkey = generate_smb3signingkey,
+ .generate_signingkey = generate_smb311signingkey,
.calc_signature = smb3_calc_signature,
.set_integrity = smb3_set_integrity,
.is_read_op = smb21_is_read_op,
@@ -1840,7 +1843,7 @@ struct smb_version_values smb21_values = {
struct smb_version_values smb30_values = {
.version_string = SMB30_VERSION_STRING,
.protocol_id = SMB30_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1860,7 +1863,7 @@ struct smb_version_values smb30_values = {
struct smb_version_values smb302_values = {
.version_string = SMB302_VERSION_STRING,
.protocol_id = SMB302_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1881,7 +1884,7 @@ struct smb_version_values smb302_values = {
struct smb_version_values smb311_values = {
.version_string = SMB311_VERSION_STRING,
.protocol_id = SMB311_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 61276929d139..42e1f440eb1e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr)
- 4 /* RFC 1001 length field itself not counted */);
- hdr->ProtocolId[0] = 0xFE;
- hdr->ProtocolId[1] = 'S';
- hdr->ProtocolId[2] = 'M';
- hdr->ProtocolId[3] = 'B';
+ hdr->ProtocolId = SMB2_PROTO_NUMBER;
hdr->StructureSize = cpu_to_le16(64);
hdr->Command = smb2_cmd;
hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */
@@ -1109,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
{
char *data_offset;
struct create_context *cc;
- unsigned int next = 0;
+ unsigned int next;
+ unsigned int remaining;
char *name;
data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
+ remaining = le32_to_cpu(rsp->CreateContextsLength);
cc = (struct create_context *)data_offset;
- do {
- cc = (struct create_context *)((char *)cc + next);
+ while (remaining >= sizeof(struct create_context)) {
name = le16_to_cpu(cc->NameOffset) + (char *)cc;
- if (le16_to_cpu(cc->NameLength) != 4 ||
- strncmp(name, "RqLs", 4)) {
- next = le32_to_cpu(cc->Next);
- continue;
- }
- return server->ops->parse_lease_buf(cc, epoch);
- } while (next != 0);
+ if (le16_to_cpu(cc->NameLength) == 4 &&
+ strncmp(name, "RqLs", 4) == 0)
+ return server->ops->parse_lease_buf(cc, epoch);
+
+ next = le32_to_cpu(cc->Next);
+ if (!next)
+ break;
+ remaining -= next;
+ cc = (struct create_context *)((char *)cc + next);
+ }
return 0;
}
@@ -1151,13 +1152,130 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
return 0;
}
+static struct create_durable_v2 *
+create_durable_v2_buf(struct cifs_fid *pfid)
+{
+ struct create_durable_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, dcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct durable_context_v2));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+ buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ get_random_bytes(buf->dcontext.CreateGuid, 16);
+ memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'Q';
+ return buf;
+}
+
+static struct create_durable_handle_reconnect_v2 *
+create_reconnect_durable_v2_buf(struct cifs_fid *fid)
+{
+ struct create_durable_handle_reconnect_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_handle_reconnect_v2),
+ GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ dcontext));
+ buf->ccontext.DataLength =
+ cpu_to_le32(sizeof(struct durable_reconnect_context_v2));
+ buf->ccontext.NameOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+ buf->dcontext.Fid.PersistentFileId = fid->persistent_fid;
+ buf->dcontext.Fid.VolatileFileId = fid->volatile_fid;
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ memcpy(buf->dcontext.CreateGuid, fid->create_guid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 is "DH2C" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'C';
+ return buf;
+}
+
static int
-add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct cifs_open_parms *oparms)
{
struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
+ iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ /* indicate that we don't need to relock the file */
+ oparms->reconnect = false;
+
+ iov[num].iov_base = create_reconnect_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ inc_rfc1001_len(&req->hdr,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms, bool use_persistent)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ if (use_persistent) {
+ if (oparms->reconnect)
+ return add_durable_reconnect_v2_context(iov, num_iovec,
+ oparms);
+ else
+ return add_durable_v2_context(iov, num_iovec, oparms);
+ }
+
if (oparms->reconnect) {
iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
/* indicate that we don't need to relock the file */
@@ -1275,7 +1393,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
ccontext->Next =
cpu_to_le32(server->vals->create_lease_size);
}
- rc = add_durable_context(iov, &num_iovecs, oparms);
+
+ rc = add_durable_context(iov, &num_iovecs, oparms,
+ tcon->use_persistent);
if (rc) {
cifs_small_buf_release(req);
kfree(copy_path);
@@ -1454,7 +1574,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto ioctl_exit;
}
- memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+ memcpy(*out_data,
+ (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
*plen);
ioctl_exit:
free_rsp_buf(resp_buftype, rsp);
@@ -1974,7 +2095,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
}
if (*buf) {
- memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset,
+ memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset,
*nbytes);
free_rsp_buf(resp_buftype, iov[0].iov_base);
} else if (resp_buftype != CIFS_NO_BUFFER) {
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 451108284a2f..ff88d9feb01e 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -86,6 +86,7 @@
#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
/*
* SMB2 Header Definition
@@ -102,7 +103,7 @@ struct smb2_hdr {
__be32 smb2_buf_length; /* big endian on wire */
/* length is only two or three bytes - with
one or two byte type preceding it that MBZ */
- __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */
+ __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
__le16 StructureSize; /* 64 */
__le16 CreditCharge; /* MBZ */
__le32 Status; /* Error from server */
@@ -128,11 +129,10 @@ struct smb2_transform_hdr {
one or two byte type preceding it that MBZ */
__u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */
__u8 Signature[16];
- __u8 Nonce[11];
- __u8 Reserved[5];
+ __u8 Nonce[16];
__le32 OriginalMessageSize;
__u16 Reserved1;
- __le16 EncryptionAlgorithm;
+ __le16 Flags; /* EncryptionAlgorithm */
__u64 SessionId;
} __packed;
@@ -590,6 +590,44 @@ struct create_durable {
} Data;
} __packed;
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
+struct durable_context_v2 {
+ __le32 Timeout;
+ __le32 Flags;
+ __u64 Reserved;
+ __u8 CreateGuid[16];
+} __packed;
+
+struct create_durable_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_context_v2 dcontext;
+} __packed;
+
+/* See MS-SMB2 2.2.13.2.12 */
+struct durable_reconnect_context_v2 {
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ __u8 CreateGuid[16];
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+/* See MS-SMB2 2.2.14.2.12 */
+struct durable_reconnect_context_v2_rsp {
+ __le32 Timeout;
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+struct create_durable_handle_reconnect_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_reconnect_context_v2 dcontext;
+} __packed;
+
#define COPY_CHUNK_RES_KEY_SIZE 24
struct resume_key_req {
char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
@@ -643,6 +681,13 @@ struct fsctl_get_integrity_information_rsp {
/* Integrity flags for above */
#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+/* See MS-SMB2 2.2.31.3 */
+struct network_resiliency_req {
+ __le32 Timeout;
+ __le32 Reserved;
+} __packed;
+/* There is no buffer for the response ie no struct network_resiliency_rsp */
+
struct validate_negotiate_info_req {
__le32 Capabilities;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 79dc650c18b2..4f07dc93608d 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -34,7 +34,8 @@ struct smb_rqst;
*****************************************************************
*/
extern int map_smb2_to_linux_error(char *buf, bool log_err);
-extern int smb2_check_message(char *buf, unsigned int length);
+extern int smb2_check_message(char *buf, unsigned int length,
+ struct TCP_Server_Info *server);
extern unsigned int smb2_calc_size(void *buf);
extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
extern __le16 *cifs_convert_path_to_utf16(const char *from,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index d4c5b6f109a7..8732a43b1008 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return rc;
}
-int
-generate_smb3signingkey(struct cifs_ses *ses)
+static int generate_key(struct cifs_ses *ses, struct kvec label,
+ struct kvec context, __u8 *key, unsigned int key_size)
{
unsigned char zero = 0x0;
__u8 i[4] = {0, 0, 0, 1};
@@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
unsigned char *hashptr = prfhash;
memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
- memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+ memset(key, 0x0, key_size);
rc = smb3_crypto_shash_allocate(ses->server);
if (rc) {
@@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
}
rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
- "SMB2AESCMAC", 12);
+ label.iov_base, label.iov_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
goto smb3signkey_ret;
@@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
}
rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
- "SmbSign", 8);
+ context.iov_base, context.iov_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
goto smb3signkey_ret;
@@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses)
goto smb3signkey_ret;
}
- memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+ memcpy(key, hashptr, key_size);
smb3signkey_ret:
return rc;
}
+struct derivation {
+ struct kvec label;
+ struct kvec context;
+};
+
+struct derivation_triplet {
+ struct derivation signing;
+ struct derivation encryption;
+ struct derivation decryption;
+};
+
+static int
+generate_smb3signingkey(struct cifs_ses *ses,
+ const struct derivation_triplet *ptriplet)
+{
+ int rc;
+
+ rc = generate_key(ses, ptriplet->signing.label,
+ ptriplet->signing.context, ses->smb3signingkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+
+ rc = generate_key(ses, ptriplet->encryption.label,
+ ptriplet->encryption.context, ses->smb3encryptionkey,
+ SMB3_SIGN_KEY_SIZE);
+ if (rc)
+ return rc;
+
+ return generate_key(ses, ptriplet->decryption.label,
+ ptriplet->decryption.context,
+ ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+}
+
+int
+generate_smb30signingkey(struct cifs_ses *ses)
+
+{
+ struct derivation_triplet triplet;
+ struct derivation *d;
+
+ d = &triplet.signing;
+ d->label.iov_base = "SMB2AESCMAC";
+ d->label.iov_len = 12;
+ d->context.iov_base = "SmbSign";
+ d->context.iov_len = 8;
+
+ d = &triplet.encryption;
+ d->label.iov_base = "SMB2AESCCM";
+ d->label.iov_len = 11;
+ d->context.iov_base = "ServerIn ";
+ d->context.iov_len = 10;
+
+ d = &triplet.decryption;
+ d->label.iov_base = "SMB2AESCCM";
+ d->label.iov_len = 11;
+ d->context.iov_base = "ServerOut";
+ d->context.iov_len = 10;
+
+ return generate_smb3signingkey(ses, &triplet);
+}
+
+int
+generate_smb311signingkey(struct cifs_ses *ses)
+
+{
+ struct derivation_triplet triplet;
+ struct derivation *d;
+
+ d = &triplet.signing;
+ d->label.iov_base = "SMB2AESCMAC";
+ d->label.iov_len = 12;
+ d->context.iov_base = "SmbSign";
+ d->context.iov_len = 8;
+
+ d = &triplet.encryption;
+ d->label.iov_base = "SMB2AESCCM";
+ d->label.iov_len = 11;
+ d->context.iov_base = "ServerIn ";
+ d->context.iov_len = 10;
+
+ d = &triplet.decryption;
+ d->label.iov_base = "SMB2AESCCM";
+ d->label.iov_len = 11;
+ d->context.iov_base = "ServerOut";
+ d->context.iov_len = 10;
+
+ return generate_smb3signingkey(ses, &triplet);
+}
+
int
smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index a4232ec4f2ba..699b7868108f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -23,6 +23,7 @@
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
+#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -70,31 +71,42 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
{
int rc;
unsigned char key2[8];
- struct crypto_blkcipher *tfm_des;
+ struct crypto_skcipher *tfm_des;
struct scatterlist sgin, sgout;
- struct blkcipher_desc desc;
+ struct skcipher_request *req;
str_to_key(key, key2);
- tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
+ tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_des)) {
rc = PTR_ERR(tfm_des);
cifs_dbg(VFS, "could not allocate des crypto API\n");
goto smbhash_err;
}
- desc.tfm = tfm_des;
+ req = skcipher_request_alloc(tfm_des, GFP_KERNEL);
+ if (!req) {
+ rc = -ENOMEM;
+ cifs_dbg(VFS, "could not allocate des crypto API\n");
+ goto smbhash_free_skcipher;
+ }
- crypto_blkcipher_setkey(tfm_des, key2, 8);
+ crypto_skcipher_setkey(tfm_des, key2, 8);
sg_init_one(&sgin, in, 8);
sg_init_one(&sgout, out, 8);
- rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL);
+
+ rc = crypto_skcipher_encrypt(req);
if (rc)
cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc);
- crypto_free_blkcipher(tfm_des);
+ skcipher_request_free(req);
+
+smbhash_free_skcipher:
+ crypto_free_skcipher(tfm_des);
smbhash_err:
return rc;
}
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a639d0dab453..f996daeea271 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
/* Retrieve an opaque file reference for server-side data movement ie copy */
#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
-#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 2a24c524fb9a..87abe8ed074c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -576,14 +576,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
cifs_in_send_dec(server);
cifs_save_when_sent(mid);
- if (rc < 0)
+ if (rc < 0) {
server->sequence_number -= 2;
+ cifs_delete_mid(mid);
+ }
+
mutex_unlock(&server->srv_mutex);
if (rc == 0)
return 0;
- cifs_delete_mid(mid);
add_credits_and_wake_if(server, credits, optype);
return rc;
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index ff9e1f8b16a4..f5dc2f0df4ad 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -190,8 +190,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#endif /* CONFIG_CIFS_ACL */
} else {
int temp;
- temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS));
+ temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS));
if (temp == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
@@ -203,8 +203,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
#else
cifs_dbg(FYI, "set POSIX ACL not supported\n");
#endif
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
@@ -292,8 +292,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
full_path, ea_name, ea_value, buf_size,
cifs_sb->local_nls, cifs_remap(cifs_sb));
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+ strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
@@ -303,8 +303,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
#else
cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
#endif /* CONFIG_CIFS_POSIX */
- } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+ } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
#ifdef CONFIG_CIFS_POSIX
if (sb->s_flags & MS_POSIXACL)
rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 7740b1c871c1..1bfb7ba4e85e 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -8,6 +8,7 @@
#include <linux/coda.h>
#include <linux/coda_psdev.h>
+#include <linux/pagemap.h>
#include "coda_linux.h"
static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
@@ -17,8 +18,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
static const struct inode_operations coda_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = coda_setattr,
};
@@ -35,6 +35,7 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
inode->i_fop = &coda_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &coda_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &coda_symlink_aops;
inode->i_mapping = &inode->i_data;
} else
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index f829fe963f5b..5104d84c4f64 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
} while (0)
-#define CODA_FREE(ptr,size) \
- do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+#define CODA_FREE(ptr, size) kvfree((ptr))
/* inode to cnode access functions */
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index fda9f4311212..42e731b8c80a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
if (host_file->f_op->iterate) {
struct inode *host_inode = file_inode(host_file);
- mutex_lock(&host_inode->i_mutex);
+ inode_lock(host_inode);
ret = -ENOENT;
if (!IS_DEADDIR(host_inode)) {
ret = host_file->f_op->iterate(host_file, ctx);
file_accessed(host_file);
}
- mutex_unlock(&host_inode->i_mutex);
+ inode_unlock(host_inode);
return ret;
}
/* Venus: we must read Venus dirents from a file */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 1da3805f3ddc..f47c7483863b 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
host_file = cfi->cfi_container;
file_start_write(host_file);
- mutex_lock(&coda_inode->i_mutex);
+ inode_lock(coda_inode);
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
- mutex_unlock(&coda_inode->i_mutex);
+ inode_unlock(coda_inode);
file_end_write(host_file);
return ret;
}
@@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&coda_inode->i_mutex);
+ inode_lock(coda_inode);
cfi = CODA_FTOC(coda_file);
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
err = vfs_fsync(host_file, datasync);
if (!err && !datasync)
err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
- mutex_unlock(&coda_inode->i_mutex);
+ inode_unlock(coda_inode);
return err;
}
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cac1390b87a3..57e81cbba0fa 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -74,9 +74,9 @@ static void init_once(void *foo)
int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
- sizeof(struct coda_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct coda_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (coda_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index ab94ef63caef..03736e20d720 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -26,7 +26,7 @@ static int coda_symlink_filler(struct file *file, struct page *page)
int error;
struct coda_inode_info *cii;
unsigned int len = PAGE_SIZE;
- char *p = kmap(page);
+ char *p = page_address(page);
cii = ITOC(inode);
@@ -34,13 +34,11 @@ static int coda_symlink_filler(struct file *file, struct page *page)
if (error)
goto fail;
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return error;
}
diff --git a/fs/compat.c b/fs/compat.c
index 6fd272d455e4..a71936a3f4cb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -792,7 +792,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
const void __user *, data)
{
char *kernel_type;
- unsigned long data_page;
+ void *options;
char *kernel_dev;
int retval;
@@ -806,26 +806,25 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
if (IS_ERR(kernel_dev))
goto out1;
- retval = copy_mount_options(data, &data_page);
- if (retval < 0)
+ options = copy_mount_options(data);
+ retval = PTR_ERR(options);
+ if (IS_ERR(options))
goto out2;
- retval = -EINVAL;
-
- if (kernel_type && data_page) {
+ if (kernel_type && options) {
if (!strcmp(kernel_type, NCPFS_NAME)) {
- do_ncp_super_data_conv((void *)data_page);
+ do_ncp_super_data_conv(options);
} else if (!strcmp(kernel_type, NFS4_NAME)) {
- if (do_nfs4_super_data_conv((void *) data_page))
+ retval = -EINVAL;
+ if (do_nfs4_super_data_conv(options))
goto out3;
}
}
- retval = do_mount(kernel_dev, dir_name, kernel_type,
- flags, (void*)data_page);
+ retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
out3:
- free_page(data_page);
+ kfree(options);
out2:
kfree(kernel_dev);
out1:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 48851f6ea6ec..bd01b92aad98 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,6 +58,8 @@
#include <linux/atalk.h>
#include <linux/gfp.h>
+#include "internal.h"
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_sock.h>
#include <net/bluetooth/rfcomm.h>
@@ -115,19 +117,38 @@
#include <asm/fbio.h>
#endif
-static int w_long(unsigned int fd, unsigned int cmd,
- compat_ulong_t __user *argp)
+#define convert_in_user(srcptr, dstptr) \
+({ \
+ typeof(*srcptr) val; \
+ \
+ get_user(val, srcptr) || put_user(val, dstptr); \
+})
+
+static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- mm_segment_t old_fs = get_fs();
int err;
- unsigned long val;
- set_fs (KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long)&val);
- set_fs (old_fs);
- if (!err && put_user(val, argp))
+ err = security_file_ioctl(file, cmd, arg);
+ if (err)
+ return err;
+
+ return vfs_ioctl(file, cmd, arg);
+}
+
+static int w_long(struct file *file,
+ unsigned int cmd, compat_ulong_t __user *argp)
+{
+ int err;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
+
+ if (valp == NULL)
return -EFAULT;
- return err;
+ err = do_ioctl(file, cmd, (unsigned long)valp);
+ if (err)
+ return err;
+ if (convert_in_user(valp, argp))
+ return -EFAULT;
+ return 0;
}
struct compat_video_event {
@@ -139,23 +160,23 @@ struct compat_video_event {
} u;
};
-static int do_video_get_event(unsigned int fd, unsigned int cmd,
- struct compat_video_event __user *up)
+static int do_video_get_event(struct file *file,
+ unsigned int cmd, struct compat_video_event __user *up)
{
- struct video_event kevent;
- mm_segment_t old_fs = get_fs();
+ struct video_event __user *kevent =
+ compat_alloc_user_space(sizeof(*kevent));
int err;
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
- set_fs(old_fs);
+ if (kevent == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, cmd, (unsigned long)kevent);
if (!err) {
- err = put_user(kevent.type, &up->type);
- err |= put_user(kevent.timestamp, &up->timestamp);
- err |= put_user(kevent.u.size.w, &up->u.size.w);
- err |= put_user(kevent.u.size.h, &up->u.size.h);
- err |= put_user(kevent.u.size.aspect_ratio,
+ err = convert_in_user(&kevent->type, &up->type);
+ err |= convert_in_user(&kevent->timestamp, &up->timestamp);
+ err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
+ err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
+ err |= convert_in_user(&kevent->u.size.aspect_ratio,
&up->u.size.aspect_ratio);
if (err)
err = -EFAULT;
@@ -169,8 +190,8 @@ struct compat_video_still_picture {
int32_t size;
};
-static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
- struct compat_video_still_picture __user *up)
+static int do_video_stillpicture(struct file *file,
+ unsigned int cmd, struct compat_video_still_picture __user *up)
{
struct video_still_picture __user *up_native;
compat_uptr_t fp;
@@ -190,7 +211,7 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
@@ -200,8 +221,8 @@ struct compat_video_spu_palette {
compat_uptr_t palette;
};
-static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
- struct compat_video_spu_palette __user *up)
+static int do_video_set_spu_palette(struct file *file,
+ unsigned int cmd, struct compat_video_spu_palette __user *up)
{
struct video_spu_palette __user *up_native;
compat_uptr_t palp;
@@ -218,7 +239,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
if (err)
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+ err = do_ioctl(file, cmd, (unsigned long) up_native);
return err;
}
@@ -276,7 +297,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov
return 0;
}
-static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
+static int sg_ioctl_trans(struct file *file, unsigned int cmd,
sg_io_hdr32_t __user *sgio32)
{
sg_io_hdr_t __user *sgio;
@@ -289,7 +310,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
if (get_user(interface_id, &sgio32->interface_id))
return -EFAULT;
if (interface_id != 'S')
- return sys_ioctl(fd, cmd, (unsigned long)sgio32);
+ return do_ioctl(file, cmd, (unsigned long)sgio32);
if (get_user(iovec_count, &sgio32->iovec_count))
return -EFAULT;
@@ -349,7 +370,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
if (put_user(compat_ptr(data), &sgio->usr_ptr))
return -EFAULT;
- err = sys_ioctl(fd, cmd, (unsigned long) sgio);
+ err = do_ioctl(file, cmd, (unsigned long) sgio);
if (err >= 0) {
void __user *datap;
@@ -380,13 +401,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
int unused;
};
-static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
- compat_sg_req_info __user *o)
+static int sg_grt_trans(struct file *file,
+ unsigned int cmd, struct compat_sg_req_info __user *o)
{
int err, i;
sg_req_info_t __user *r;
r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
- err = sys_ioctl(fd,cmd,(unsigned long)r);
+ err = do_ioctl(file, cmd, (unsigned long)r);
if (err < 0)
return err;
for (i = 0; i < SG_MAX_QUEUE; i++) {
@@ -412,8 +433,8 @@ struct sock_fprog32 {
#define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32)
#define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32)
-static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
- struct sock_fprog32 __user *u_fprog32)
+static int ppp_sock_fprog_ioctl_trans(struct file *file,
+ unsigned int cmd, struct sock_fprog32 __user *u_fprog32)
{
struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
void __user *fptr64;
@@ -435,7 +456,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
else
cmd = PPPIOCSACTIVE;
- return sys_ioctl(fd, cmd, (unsigned long) u_fprog64);
+ return do_ioctl(file, cmd, (unsigned long) u_fprog64);
}
struct ppp_option_data32 {
@@ -451,7 +472,7 @@ struct ppp_idle32 {
};
#define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32)
-static int ppp_gidle(unsigned int fd, unsigned int cmd,
+static int ppp_gidle(struct file *file, unsigned int cmd,
struct ppp_idle32 __user *idle32)
{
struct ppp_idle __user *idle;
@@ -460,7 +481,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
idle = compat_alloc_user_space(sizeof(*idle));
- err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
+ err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle);
if (!err) {
if (get_user(xmit, &idle->xmit_idle) ||
@@ -472,7 +493,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
return err;
}
-static int ppp_scompress(unsigned int fd, unsigned int cmd,
+static int ppp_scompress(struct file *file, unsigned int cmd,
struct ppp_option_data32 __user *odata32)
{
struct ppp_option_data __user *odata;
@@ -492,7 +513,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd,
sizeof(__u32) + sizeof(int)))
return -EFAULT;
- return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
+ return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata);
}
#ifdef CONFIG_BLOCK
@@ -512,12 +533,13 @@ struct mtpos32 {
};
#define MTIOCPOS32 _IOR('m', 3, struct mtpos32)
-static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
+static int mt_ioctl_trans(struct file *file,
+ unsigned int cmd, void __user *argp)
{
- mm_segment_t old_fs = get_fs();
- struct mtget get;
+ /* NULL initialization to make gcc shut up */
+ struct mtget __user *get = NULL;
struct mtget32 __user *umget32;
- struct mtpos pos;
+ struct mtpos __user *pos = NULL;
struct mtpos32 __user *upos32;
unsigned long kcmd;
void *karg;
@@ -526,32 +548,34 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
switch(cmd) {
case MTIOCPOS32:
kcmd = MTIOCPOS;
- karg = &pos;
+ pos = compat_alloc_user_space(sizeof(*pos));
+ karg = pos;
break;
default: /* MTIOCGET32 */
kcmd = MTIOCGET;
- karg = &get;
+ get = compat_alloc_user_space(sizeof(*get));
+ karg = get;
break;
}
- set_fs (KERNEL_DS);
- err = sys_ioctl (fd, kcmd, (unsigned long)karg);
- set_fs (old_fs);
+ if (karg == NULL)
+ return -EFAULT;
+ err = do_ioctl(file, kcmd, (unsigned long)karg);
if (err)
return err;
switch (cmd) {
case MTIOCPOS32:
upos32 = argp;
- err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
+ err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno);
break;
case MTIOCGET32:
umget32 = argp;
- err = __put_user(get.mt_type, &umget32->mt_type);
- err |= __put_user(get.mt_resid, &umget32->mt_resid);
- err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
- err |= __put_user(get.mt_gstat, &umget32->mt_gstat);
- err |= __put_user(get.mt_erreg, &umget32->mt_erreg);
- err |= __put_user(get.mt_fileno, &umget32->mt_fileno);
- err |= __put_user(get.mt_blkno, &umget32->mt_blkno);
+ err = convert_in_user(&get->mt_type, &umget32->mt_type);
+ err |= convert_in_user(&get->mt_resid, &umget32->mt_resid);
+ err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg);
+ err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat);
+ err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg);
+ err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno);
+ err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno);
break;
}
return err ? -EFAULT: 0;
@@ -605,42 +629,41 @@ struct serial_struct32 {
compat_int_t reserved[1];
};
-static int serial_struct_ioctl(unsigned fd, unsigned cmd,
- struct serial_struct32 __user *ss32)
+static int serial_struct_ioctl(struct file *file,
+ unsigned cmd, struct serial_struct32 __user *ss32)
{
typedef struct serial_struct32 SS32;
int err;
- struct serial_struct ss;
- mm_segment_t oldseg = get_fs();
+ struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
__u32 udata;
unsigned int base;
+ unsigned char *iomem_base;
+ if (ss == NULL)
+ return -EFAULT;
if (cmd == TIOCSSERIAL) {
- if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
- return -EFAULT;
- if (__get_user(udata, &ss32->iomem_base))
+ if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
+ get_user(udata, &ss32->iomem_base))
return -EFAULT;
- ss.iomem_base = compat_ptr(udata);
- if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __get_user(ss.port_high, &ss32->port_high))
+ iomem_base = compat_ptr(udata);
+ if (put_user(iomem_base, &ss->iomem_base) ||
+ convert_in_user(&ss32->iomem_reg_shift,
+ &ss->iomem_reg_shift) ||
+ convert_in_user(&ss32->port_high, &ss->port_high) ||
+ put_user(0UL, &ss->iomap_base))
return -EFAULT;
- ss.iomap_base = 0UL;
}
- set_fs(KERNEL_DS);
- err = sys_ioctl(fd,cmd,(unsigned long)(&ss));
- set_fs(oldseg);
+ err = do_ioctl(file, cmd, (unsigned long)ss);
if (cmd == TIOCGSERIAL && err >= 0) {
- if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32)))
- return -EFAULT;
- if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
+ if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
+ get_user(iomem_base, &ss->iomem_base))
return -EFAULT;
- base = (unsigned long)ss.iomem_base >> 32 ?
- 0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
- if (__put_user(base, &ss32->iomem_base) ||
- __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
- __put_user(ss.port_high, &ss32->port_high))
+ base = (unsigned long)iomem_base >> 32 ?
+ 0xffffffff : (unsigned)(unsigned long)iomem_base;
+ if (put_user(base, &ss32->iomem_base) ||
+ convert_in_user(&ss->iomem_reg_shift,
+ &ss32->iomem_reg_shift) ||
+ convert_in_user(&ss->port_high, &ss32->port_high))
return -EFAULT;
}
return err;
@@ -674,8 +697,8 @@ struct i2c_rdwr_aligned {
struct i2c_msg msgs[0];
};
-static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_rdwr_ioctl_data32 __user *udata)
+static int do_i2c_rdwr_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata)
{
struct i2c_rdwr_aligned __user *tdata;
struct i2c_msg __user *tmsgs;
@@ -686,7 +709,7 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
if (get_user(nmsgs, &udata->nmsgs))
return -EFAULT;
- if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS)
+ if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
return -EINVAL;
if (get_user(datap, &udata->msgs))
@@ -708,11 +731,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
put_user(compat_ptr(datap), &tmsgs[i].buf))
return -EFAULT;
}
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
-static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
- struct i2c_smbus_ioctl_data32 __user *udata)
+static int do_i2c_smbus_ioctl(struct file *file,
+ unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata)
{
struct i2c_smbus_ioctl_data __user *tdata;
compat_caddr_t datap;
@@ -734,7 +757,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
__put_user(compat_ptr(datap), &tdata->data))
return -EFAULT;
- return sys_ioctl(fd, cmd, (unsigned long)tdata);
+ return do_ioctl(file, cmd, (unsigned long)tdata);
}
#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
@@ -742,29 +765,27 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t)
-static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
+static int rtc_ioctl(struct file *file,
+ unsigned cmd, void __user *argp)
{
- mm_segment_t oldfs = get_fs();
- compat_ulong_t val32;
- unsigned long kval;
+ unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
int ret;
+ if (valp == NULL)
+ return -EFAULT;
switch (cmd) {
case RTC_IRQP_READ32:
case RTC_EPOCH_READ32:
- set_fs(KERNEL_DS);
- ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
+ ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ?
RTC_IRQP_READ : RTC_EPOCH_READ,
- (unsigned long)&kval);
- set_fs(oldfs);
+ (unsigned long)valp);
if (ret)
return ret;
- val32 = kval;
- return put_user(val32, (unsigned int __user *)argp);
+ return convert_in_user(valp, (unsigned int __user *)argp);
case RTC_IRQP_SET32:
- return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp);
case RTC_EPOCH_SET32:
- return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
+ return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp);
}
return -ENOIOCTLCMD;
@@ -1019,28 +1040,6 @@ COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
/* PPPOX */
COMPATIBLE_IOCTL(PPPOEIOCSFWD)
COMPATIBLE_IOCTL(PPPOEIOCDFWD)
-/* ppdev */
-COMPATIBLE_IOCTL(PPSETMODE)
-COMPATIBLE_IOCTL(PPRSTATUS)
-COMPATIBLE_IOCTL(PPRCONTROL)
-COMPATIBLE_IOCTL(PPWCONTROL)
-COMPATIBLE_IOCTL(PPFCONTROL)
-COMPATIBLE_IOCTL(PPRDATA)
-COMPATIBLE_IOCTL(PPWDATA)
-COMPATIBLE_IOCTL(PPCLAIM)
-COMPATIBLE_IOCTL(PPRELEASE)
-COMPATIBLE_IOCTL(PPYIELD)
-COMPATIBLE_IOCTL(PPEXCL)
-COMPATIBLE_IOCTL(PPDATADIR)
-COMPATIBLE_IOCTL(PPNEGOT)
-COMPATIBLE_IOCTL(PPWCTLONIRQ)
-COMPATIBLE_IOCTL(PPCLRIRQ)
-COMPATIBLE_IOCTL(PPSETPHASE)
-COMPATIBLE_IOCTL(PPGETMODES)
-COMPATIBLE_IOCTL(PPGETMODE)
-COMPATIBLE_IOCTL(PPGETPHASE)
-COMPATIBLE_IOCTL(PPGETFLAGS)
-COMPATIBLE_IOCTL(PPSETFLAGS)
/* Big A */
/* sparc only */
/* Big Q for sound/OSS */
@@ -1240,6 +1239,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
COMPATIBLE_IOCTL(HCIINQUIRY)
COMPATIBLE_IOCTL(HCIUARTSETPROTO)
COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
+COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
+COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
@@ -1284,12 +1286,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* NBD */
-COMPATIBLE_IOCTL(NBD_DO_IT)
-COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
-COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
-COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
-COMPATIBLE_IOCTL(NBD_DISCONNECT)
/* i2c */
COMPATIBLE_IOCTL(I2C_SLAVE)
COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
@@ -1436,53 +1432,53 @@ IGNORE_IOCTL(FBIOGCURSOR32)
* a compat_ioctl operation in the place that handles the
* ioctl for the native case.
*/
-static long do_ioctl_trans(int fd, unsigned int cmd,
+static long do_ioctl_trans(unsigned int cmd,
unsigned long arg, struct file *file)
{
void __user *argp = compat_ptr(arg);
switch (cmd) {
case PPPIOCGIDLE32:
- return ppp_gidle(fd, cmd, argp);
+ return ppp_gidle(file, cmd, argp);
case PPPIOCSCOMPRESS32:
- return ppp_scompress(fd, cmd, argp);
+ return ppp_scompress(file, cmd, argp);
case PPPIOCSPASS32:
case PPPIOCSACTIVE32:
- return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
+ return ppp_sock_fprog_ioctl_trans(file, cmd, argp);
#ifdef CONFIG_BLOCK
case SG_IO:
- return sg_ioctl_trans(fd, cmd, argp);
+ return sg_ioctl_trans(file, cmd, argp);
case SG_GET_REQUEST_TABLE:
- return sg_grt_trans(fd, cmd, argp);
+ return sg_grt_trans(file, cmd, argp);
case MTIOCGET32:
case MTIOCPOS32:
- return mt_ioctl_trans(fd, cmd, argp);
+ return mt_ioctl_trans(file, cmd, argp);
#endif
/* Serial */
case TIOCGSERIAL:
case TIOCSSERIAL:
- return serial_struct_ioctl(fd, cmd, argp);
+ return serial_struct_ioctl(file, cmd, argp);
/* i2c */
case I2C_FUNCS:
- return w_long(fd, cmd, argp);
+ return w_long(file, cmd, argp);
case I2C_RDWR:
- return do_i2c_rdwr_ioctl(fd, cmd, argp);
+ return do_i2c_rdwr_ioctl(file, cmd, argp);
case I2C_SMBUS:
- return do_i2c_smbus_ioctl(fd, cmd, argp);
+ return do_i2c_smbus_ioctl(file, cmd, argp);
/* Not implemented in the native kernel */
case RTC_IRQP_READ32:
case RTC_IRQP_SET32:
case RTC_EPOCH_READ32:
case RTC_EPOCH_SET32:
- return rtc_ioctl(fd, cmd, argp);
+ return rtc_ioctl(file, cmd, argp);
/* dvb */
case VIDEO_GET_EVENT:
- return do_video_get_event(fd, cmd, argp);
+ return do_video_get_event(file, cmd, argp);
case VIDEO_STILLPICTURE:
- return do_video_stillpicture(fd, cmd, argp);
+ return do_video_stillpicture(file, cmd, argp);
case VIDEO_SET_SPU_PALETTE:
- return do_video_set_spu_palette(fd, cmd, argp);
+ return do_video_set_spu_palette(file, cmd, argp);
}
/*
@@ -1508,12 +1504,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
case KDSKBMETA:
case KDSKBLED:
case KDSETLED:
- /* NBD */
- case NBD_SET_SOCK:
- case NBD_SET_BLKSIZE:
- case NBD_SET_SIZE:
- case NBD_SET_SIZE_BLOCKS:
- return do_vfs_ioctl(file, fd, cmd, arg);
+ return vfs_ioctl(file, cmd, arg);
}
return -ENOIOCTLCMD;
@@ -1580,6 +1571,11 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
goto out_fput;
#endif
+ case FICLONE:
+ case FICLONERANGE:
+ case FIDEDUPERANGE:
+ goto do_ioctl;
+
case FIBMAP:
case FIGETBSZ:
case FIONREAD:
@@ -1602,7 +1598,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (compat_ioctl_check_table(XFORM(cmd)))
goto found_handler;
- error = do_ioctl_trans(fd, cmd, arg, f.file);
+ error = do_ioctl_trans(cmd, arg, f.file);
if (error == -ENOIOCTLCMD)
error = -ENOTTY;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index b65d1ef532d5..ccc31fa6f1a7 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -53,13 +53,14 @@ struct configfs_dirent {
#define CONFIGFS_ROOT 0x0001
#define CONFIGFS_DIR 0x0002
#define CONFIGFS_ITEM_ATTR 0x0004
+#define CONFIGFS_ITEM_BIN_ATTR 0x0008
#define CONFIGFS_ITEM_LINK 0x0020
#define CONFIGFS_USET_DIR 0x0040
#define CONFIGFS_USET_DEFAULT 0x0080
#define CONFIGFS_USET_DROPPING 0x0100
#define CONFIGFS_USET_IN_MKDIR 0x0200
#define CONFIGFS_USET_CREATING 0x0400
-#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
+#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)
extern struct mutex configfs_symlink_mutex;
extern spinlock_t configfs_dirent_lock;
@@ -72,6 +73,8 @@ extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *,
extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_create_bin_file(struct config_item *,
+ const struct configfs_bin_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
struct dentry *, void *, umode_t, int);
extern int configfs_dirent_is_ready(struct configfs_dirent *);
@@ -88,7 +91,7 @@ extern void configfs_release_fs(void);
extern struct rw_semaphore configfs_rename_sem;
extern const struct file_operations configfs_dir_operations;
extern const struct file_operations configfs_file_operations;
-extern const struct file_operations bin_fops;
+extern const struct file_operations configfs_bin_file_operations;
extern const struct inode_operations configfs_dir_inode_operations;
extern const struct inode_operations configfs_root_inode_operations;
extern const struct inode_operations configfs_symlink_inode_operations;
@@ -119,6 +122,13 @@ static inline struct configfs_attribute * to_attr(struct dentry * dentry)
return ((struct configfs_attribute *) sd->s_element);
}
+/*
+ * Map an attribute dentry to its binary attribute: to_attr() yields the
+ * configfs_attribute stored for the dentry, and container_of() steps back to
+ * the configfs_bin_attribute that embeds it as cb_attr. Only meaningful for
+ * dentries whose attribute really is embedded in a configfs_bin_attribute.
+ */
+static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry)
+{
+	return container_of(to_attr(dentry), struct configfs_bin_attribute,
+			    cb_attr);
+}
+
static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
{
struct config_item * item = NULL;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index c81ce7f200a6..ea59c891fc53 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -255,6 +255,12 @@ static void configfs_init_file(struct inode * inode)
inode->i_fop = &configfs_file_operations;
}
+/*
+ * Inode init callback for binary attribute files: install the binary file
+ * operations and start the inode with a size of zero.
+ */
+static void configfs_init_bin_file(struct inode *inode)
+{
+	inode->i_fop = &configfs_bin_file_operations;
+	inode->i_size = 0;
+}
+
static void init_symlink(struct inode * inode)
{
inode->i_op = &configfs_symlink_inode_operations;
@@ -423,15 +429,12 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
spin_unlock(&configfs_dirent_lock);
error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
- configfs_init_file);
- if (error) {
+ (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
+ configfs_init_bin_file :
+ configfs_init_file);
+ if (error)
configfs_put(sd);
- return error;
- }
-
- d_rehash(dentry);
-
- return 0;
+ return error;
}
static struct dentry * configfs_lookup(struct inode *dir,
@@ -583,6 +586,7 @@ static int populate_attrs(struct config_item *item)
{
struct config_item_type *t = item->ci_type;
struct configfs_attribute *attr;
+ struct configfs_bin_attribute *bin_attr;
int error = 0;
int i;
@@ -594,6 +598,13 @@ static int populate_attrs(struct config_item *item)
break;
}
}
+ if (t->ct_bin_attrs) {
+ for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
+ error = configfs_create_bin_file(item, bin_attr);
+ if (error)
+ break;
+ }
+ }
if (error)
detach_attrs(item);
@@ -624,13 +635,13 @@ static void detach_groups(struct config_group *group)
child = sd->s_dentry;
- mutex_lock(&d_inode(child)->i_mutex);
+ inode_lock(d_inode(child));
configfs_detach_group(sd->s_element);
d_inode(child)->i_flags |= S_DEAD;
dont_mount(child);
- mutex_unlock(&d_inode(child)->i_mutex);
+ inode_unlock(d_inode(child));
d_delete(child);
dput(child);
@@ -685,23 +696,29 @@ static int populate_groups(struct config_group *group)
{
struct config_group *new_group;
int ret = 0;
- int i;
- if (group->default_groups) {
- for (i = 0; group->default_groups[i]; i++) {
- new_group = group->default_groups[i];
-
- ret = create_default_group(group, new_group);
- if (ret) {
- detach_groups(group);
- break;
- }
+ list_for_each_entry(new_group, &group->default_groups, group_entry) {
+ ret = create_default_group(group, new_group);
+ if (ret) {
+ detach_groups(group);
+ break;
}
}
return ret;
}
+/**
+ * configfs_remove_default_groups - empty a group's default_groups list
+ * @group: group whose default groups are to be dropped
+ *
+ * Walks @group->default_groups, unlinks every entry from the list and drops
+ * one reference on each entry's config_item.
+ */
+void configfs_remove_default_groups(struct config_group *group)
+{
+ struct config_group *g, *n;
+
+ /* _safe variant: each entry is removed from the list while iterating */
+ list_for_each_entry_safe(g, n, &group->default_groups, group_entry) {
+ list_del(&g->group_entry);
+ config_item_put(&g->cg_item);
+ }
+}
+EXPORT_SYMBOL(configfs_remove_default_groups);
+
/*
* All of link_obj/unlink_obj/link_group/unlink_group require that
* subsys->su_mutex is held.
@@ -750,15 +767,10 @@ static void link_obj(struct config_item *parent_item, struct config_item *item)
static void unlink_group(struct config_group *group)
{
- int i;
struct config_group *new_group;
- if (group->default_groups) {
- for (i = 0; group->default_groups[i]; i++) {
- new_group = group->default_groups[i];
- unlink_group(new_group);
- }
- }
+ list_for_each_entry(new_group, &group->default_groups, group_entry)
+ unlink_group(new_group);
group->cg_subsys = NULL;
unlink_obj(&group->cg_item);
@@ -766,7 +778,6 @@ static void unlink_group(struct config_group *group)
static void link_group(struct config_group *parent_group, struct config_group *group)
{
- int i;
struct config_group *new_group;
struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
@@ -780,12 +791,8 @@ static void link_group(struct config_group *parent_group, struct config_group *g
BUG();
group->cg_subsys = subsys;
- if (group->default_groups) {
- for (i = 0; group->default_groups[i]; i++) {
- new_group = group->default_groups[i];
- link_group(group, new_group);
- }
- }
+ list_for_each_entry(new_group, &group->default_groups, group_entry)
+ link_group(group, new_group);
}
/*
@@ -818,11 +825,11 @@ static int configfs_attach_item(struct config_item *parent_item,
* the VFS may already have hit and used them. Thus,
* we must lock them as rmdir() would.
*/
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
configfs_remove_dir(item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
d_delete(dentry);
}
}
@@ -858,7 +865,7 @@ static int configfs_attach_group(struct config_item *parent_item,
* We must also lock the inode to remove it safely in case of
* error, as rmdir() would.
*/
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
configfs_adjust_dir_dirent_depth_before_populate(sd);
ret = populate_groups(to_config_group(item));
if (ret) {
@@ -867,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item,
dont_mount(dentry);
}
configfs_adjust_dir_dirent_depth_after_populate(sd);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
if (ret)
d_delete(dentry);
}
@@ -1054,11 +1061,55 @@ out:
return ret;
}
+/*
+ * Core of the depend operation. Under configfs_dirent_lock, let
+ * configfs_depend_prep() look for @target below @subsys_dentry and, when it
+ * reports success (0), bump the s_dependent_count of the target's dirent.
+ * Returns whatever configfs_depend_prep() returned.
+ */
+static int configfs_do_depend_item(struct dentry *subsys_dentry,
+				   struct config_item *target)
+{
+	struct configfs_dirent *target_sd;
+	int err;
+
+	spin_lock(&configfs_dirent_lock);
+	/* Scan the tree, return 0 if found */
+	err = configfs_depend_prep(subsys_dentry, target);
+	if (!err) {
+		/*
+		 * We are sure that the item is not about to be removed by
+		 * rmdir(), and not in the middle of attachment by mkdir().
+		 */
+		target_sd = target->ci_dentry->d_fsdata;
+		target_sd->s_dependent_count += 1;
+	}
+	spin_unlock(&configfs_dirent_lock);
+
+	return err;
+}
+
+/*
+ * Look through the children of @root_sd for the directory dirent whose
+ * s_element is @subsys_item. Returns that dirent, or NULL when no child
+ * matches.
+ */
+static inline struct configfs_dirent *
+configfs_find_subsys_dentry(struct configfs_dirent *root_sd,
+			    struct config_item *subsys_item)
+{
+	struct configfs_dirent *sd;
+
+	list_for_each_entry(sd, &root_sd->s_children, s_sibling) {
+		if (!(sd->s_type & CONFIGFS_DIR))
+			continue;
+		if (sd->s_element == subsys_item)
+			return sd;
+	}
+
+	return NULL;
+}
+
+
int configfs_depend_item(struct configfs_subsystem *subsys,
struct config_item *target)
{
int ret;
- struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+ struct configfs_dirent *subsys_sd;
struct config_item *s_item = &subsys->su_group.cg_item;
struct dentry *root;
@@ -1075,43 +1126,19 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
* subsystem is really registered, and so we need to lock out
* configfs_[un]register_subsystem().
*/
- mutex_lock(&d_inode(root)->i_mutex);
-
- root_sd = root->d_fsdata;
-
- list_for_each_entry(p, &root_sd->s_children, s_sibling) {
- if (p->s_type & CONFIGFS_DIR) {
- if (p->s_element == s_item) {
- subsys_sd = p;
- break;
- }
- }
- }
+ inode_lock(d_inode(root));
+ subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item);
if (!subsys_sd) {
ret = -ENOENT;
goto out_unlock_fs;
}
/* Ok, now we can trust subsys/s_item */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
- spin_lock(&configfs_dirent_lock);
- /* Scan the tree, return 0 if found */
- ret = configfs_depend_prep(subsys_sd->s_dentry, target);
- if (ret)
- goto out_unlock_dirent_lock;
-
- /*
- * We are sure that the item is not about to be removed by rmdir(), and
- * not in the middle of attachment by mkdir().
- */
- p = target->ci_dentry->d_fsdata;
- p->s_dependent_count += 1;
-
-out_unlock_dirent_lock:
- spin_unlock(&configfs_dirent_lock);
out_unlock_fs:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
/*
* If we succeeded, the fs is pinned via other methods. If not,
@@ -1128,8 +1155,7 @@ EXPORT_SYMBOL(configfs_depend_item);
* configfs_depend_item() because we know that the client driver is
* pinned, thus the subsystem is pinned, and therefore configfs is pinned.
*/
-void configfs_undepend_item(struct configfs_subsystem *subsys,
- struct config_item *target)
+void configfs_undepend_item(struct config_item *target)
{
struct configfs_dirent *sd;
@@ -1152,6 +1178,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
}
EXPORT_SYMBOL(configfs_undepend_item);
+/*
+ * Depend on @target without unconditionally taking the configfs root lock.
+ *
+ * caller_subsys is the caller's subsystem, not the target's. It is used to
+ * decide whether the root must be locked and the target's subsystem looked
+ * up. When caller and target are in the same subsystem no locking is done:
+ * we are called from a callback of one of that subsystem's children while
+ * the VFS holds a lock on some inode, so the subsystem is valid and cannot be
+ * unregistered during this function. Otherwise the root is locked to ensure
+ * that the target's subsystem is not unregistered during this function.
+ *
+ * Returns -EINVAL when @target is the configfs root, -ENOENT when the
+ * target's subsystem is not found under the root, and otherwise the result
+ * of configfs_do_depend_item().
+ */
+int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys,
+ struct config_item *target)
+{
+ struct configfs_subsystem *target_subsys;
+ struct config_group *root, *parent;
+ struct configfs_dirent *subsys_sd;
+ int ret = -ENOENT;
+
+ /* Disallow this function for configfs root */
+ if (configfs_is_root(target))
+ return -EINVAL;
+
+ parent = target->ci_group;
+ /*
+ * The parent is the configfs root when someone is trying to depend on
+ * the root directory of some subsystem
+ */
+ if (configfs_is_root(&parent->cg_item)) {
+ target_subsys = to_configfs_subsystem(to_config_group(target));
+ root = parent;
+ } else {
+ target_subsys = parent->cg_subsys;
+ /* Find the configfs root as we may need it for locking */
+ for (root = parent; !configfs_is_root(&root->cg_item);
+ root = root->cg_item.ci_group)
+ ;
+ }
+
+ if (target_subsys != caller_subsys) {
+ /*
+ * The target is in another configfs subsystem, so we have to
+ * do additional locking to prevent that subsystem from being
+ * unregistered
+ */
+ inode_lock(d_inode(root->cg_item.ci_dentry));
+
+ /*
+ * As we are trying to depend on an item from another subsystem
+ * we have to check that this subsystem is still registered
+ */
+ subsys_sd = configfs_find_subsys_dentry(
+ root->cg_item.ci_dentry->d_fsdata,
+ &target_subsys->su_group.cg_item);
+ if (!subsys_sd)
+ goto out_root_unlock;
+ } else {
+ subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata;
+ }
+
+ /* Now we can execute the core of depend item */
+ ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
+
+ /*
+ * The label below sits inside the if body on purpose: the unlock runs
+ * either when the condition holds or when the lookup above jumped here
+ * (which only happens with the root lock held).
+ */
+ if (target_subsys != caller_subsys)
+out_root_unlock:
+ /*
+ * We were called from a subsystem other than our target's, so
+ * we took the root lock and now it is time to release it
+ */
+ inode_unlock(d_inode(root->cg_item.ci_dentry));
+
+ return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item_unlocked);
+
static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
int ret = 0;
@@ -1453,7 +1552,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
down_write(&configfs_rename_sem);
parent = item->parent->dentry;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
if (!IS_ERR(new_dentry)) {
@@ -1469,7 +1568,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
error = -EEXIST;
dput(new_dentry);
}
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
up_write(&configfs_rename_sem);
return error;
@@ -1482,7 +1581,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
struct configfs_dirent * parent_sd = dentry->d_fsdata;
int err;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
/*
* Fake invisibility if dir belongs to a group/default groups hierarchy
* being attached
@@ -1495,7 +1594,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
else
err = 0;
}
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return err;
}
@@ -1505,11 +1604,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
struct dentry * dentry = file->f_path.dentry;
struct configfs_dirent * cursor = file->private_data;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
spin_lock(&configfs_dirent_lock);
list_del_init(&cursor->s_sibling);
spin_unlock(&configfs_dirent_lock);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
release_configfs_dirent(cursor);
@@ -1590,7 +1689,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry * dentry = file->f_path.dentry;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
switch (whence) {
case 1:
offset += file->f_pos;
@@ -1598,7 +1697,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -1624,7 +1723,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
spin_unlock(&configfs_dirent_lock);
}
}
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return offset;
}
@@ -1636,6 +1735,116 @@ const struct file_operations configfs_dir_operations = {
.iterate = configfs_readdir,
};
+/**
+ * configfs_register_group - creates a parent-child relation between two groups
+ * @parent_group:	parent group
+ * @group:		child group
+ *
+ * Links the groups (under the subsystem's su_mutex), then creates the dentry
+ * for the child and attaches it to the parent dentry with the parent inode
+ * locked. On success the new directory is marked ready.
+ *
+ * If creating the directory fails, the link established by link_group() is
+ * undone again so that the parent keeps no reference to @group; callers such
+ * as configfs_register_default_group() free @group on failure.
+ *
+ * Return: 0 on success, negative errno code on error
+ */
+int configfs_register_group(struct config_group *parent_group,
+			    struct config_group *group)
+{
+	struct configfs_subsystem *subsys = parent_group->cg_subsys;
+	struct dentry *parent;
+	int ret;
+
+	mutex_lock(&subsys->su_mutex);
+	link_group(parent_group, group);
+	mutex_unlock(&subsys->su_mutex);
+
+	parent = parent_group->cg_item.ci_dentry;
+
+	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
+	ret = create_default_group(parent_group, group);
+	if (ret)
+		goto err_out;
+
+	spin_lock(&configfs_dirent_lock);
+	configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
+	spin_unlock(&configfs_dirent_lock);
+	inode_unlock(d_inode(parent));
+	return 0;
+
+err_out:
+	inode_unlock(d_inode(parent));
+	/* roll back link_group(); link/unlink require su_mutex */
+	mutex_lock(&subsys->su_mutex);
+	unlink_group(group);
+	mutex_unlock(&subsys->su_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(configfs_register_group);
+
+/**
+ * configfs_unregister_group() - unregisters a child group from its parent
+ * @group: child group to be unregistered
+ *
+ * Undoes configfs_register_group(): detaches the group's directory with the
+ * parent inode locked, drops the dentry, then unlinks the group under the
+ * subsystem's su_mutex.
+ */
+void configfs_unregister_group(struct config_group *group)
+{
+ struct configfs_subsystem *subsys = group->cg_subsys;
+ struct dentry *dentry = group->cg_item.ci_dentry;
+ struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
+
+ inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
+ spin_lock(&configfs_dirent_lock);
+ /*
+  * NOTE(review): the return value is ignored here, whereas
+  * configfs_unregister_subsystem() checks it - confirm this is intended.
+  */
+ configfs_detach_prep(dentry, NULL);
+ spin_unlock(&configfs_dirent_lock);
+
+ configfs_detach_group(&group->cg_item);
+ d_inode(dentry)->i_flags |= S_DEAD;
+ dont_mount(dentry);
+ d_delete(dentry);
+ inode_unlock(d_inode(parent));
+
+ dput(dentry);
+
+ /* link/unlink of groups requires su_mutex */
+ mutex_lock(&subsys->su_mutex);
+ unlink_group(group);
+ mutex_unlock(&subsys->su_mutex);
+}
+EXPORT_SYMBOL(configfs_unregister_group);
+
+/**
+ * configfs_register_default_group() - allocates and registers a child group
+ * @parent_group:	parent group
+ * @name:		child group name
+ * @item_type:		child item type description
+ *
+ * Convenience wrapper that allocates a child group, initialises it with
+ * @name and @item_type and registers it with its parent. The memory is
+ * kzalloc'ed because the child's default_group is initially empty. If
+ * registration fails the allocation is freed again.
+ *
+ * Return: allocated config group or ERR_PTR() on error
+ */
+struct config_group *
+configfs_register_default_group(struct config_group *parent_group,
+				const char *name,
+				struct config_item_type *item_type)
+{
+	struct config_group *child;
+	int err;
+
+	child = kzalloc(sizeof(*child), GFP_KERNEL);
+	if (!child)
+		return ERR_PTR(-ENOMEM);
+	config_group_init_type_name(child, name, item_type);
+
+	err = configfs_register_group(parent_group, child);
+	if (!err)
+		return child;
+
+	kfree(child);
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(configfs_register_default_group);
+
+/**
+ * configfs_unregister_default_group() - unregisters and frees a child group
+ * @group: the group to act on
+ *
+ * Counterpart of configfs_register_default_group(): unregisters @group via
+ * configfs_unregister_group() and then kfree()s it, so @group must not be
+ * used after this call.
+ */
+void configfs_unregister_default_group(struct config_group *group)
+{
+ configfs_unregister_group(group);
+ kfree(group);
+}
+EXPORT_SYMBOL(configfs_unregister_default_group);
+
int configfs_register_subsystem(struct configfs_subsystem *subsys)
{
int err;
@@ -1654,7 +1863,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
sd = root->d_fsdata;
link_group(to_config_group(sd->s_element), group);
- mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
err = -ENOMEM;
dentry = d_alloc_name(root, group->cg_item.ci_name);
@@ -1674,7 +1883,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
}
}
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
if (err) {
unlink_group(group);
@@ -1695,9 +1904,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
return;
}
- mutex_lock_nested(&d_inode(root)->i_mutex,
+ inode_lock_nested(d_inode(root),
I_MUTEX_PARENT);
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
if (configfs_detach_prep(dentry, NULL)) {
@@ -1708,11 +1917,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
configfs_detach_group(&group->cg_item);
d_inode(dentry)->i_flags |= S_DEAD;
dont_mount(dentry);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
d_delete(dentry);
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
dput(dentry);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 403269ffcdf3..33b7ee34eda5 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -28,6 +28,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <linux/configfs.h>
@@ -48,6 +49,10 @@ struct configfs_buffer {
struct configfs_item_operations * ops;
struct mutex mutex;
int needs_read_fill;
+ bool read_in_progress;
+ bool write_in_progress;
+ char *bin_buffer;
+ int bin_buffer_size;
};
@@ -65,7 +70,6 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
{
struct configfs_attribute * attr = to_attr(dentry);
struct config_item * item = to_item(dentry->d_parent);
- struct configfs_item_operations * ops = buffer->ops;
int ret = 0;
ssize_t count;
@@ -74,7 +78,8 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
if (!buffer->page)
return -ENOMEM;
- count = ops->show_attribute(item,attr,buffer->page);
+ count = attr->show(item, buffer->page);
+
buffer->needs_read_fill = 0;
BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
if (count >= 0)
@@ -123,6 +128,87 @@ out:
return retval;
}
+/**
+ * configfs_read_bin_file - read a binary attribute.
+ * @file: file pointer.
+ * @buf: buffer to fill.
+ * @count: number of bytes to read.
+ * @ppos: starting offset in file.
+ *
+ * Userspace wants to read a binary attribute file. The attribute
+ * descriptor is in the file's ->d_fsdata. The target item is in the
+ * directory's ->d_fsdata.
+ *
+ * We check whether we need to refill the buffer. If so we will
+ * call the attributes' attr->read() twice. The first time we
+ * will pass a NULL as a buffer pointer, which the attributes' method
+ * will use to return the size of the buffer required. If no error
+ * occurs we will allocate the buffer using vmalloc and call
+ * attr->read() again passing that buffer as an argument.
+ * Then we just copy to user-space using simple_read_from_buffer.
+ *
+ * Returns the number of bytes copied, 0 when the attribute reports no data,
+ * -ETXTBSY if the file has already been written through this open,
+ * -EFBIG if the reported size exceeds a non-zero cb_max_size, -ENOMEM if the
+ * buffer cannot be allocated, or the negative value returned by attr->read().
+ */
+
+static ssize_t
+configfs_read_bin_file(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct configfs_buffer *buffer = file->private_data;
+ struct dentry *dentry = file->f_path.dentry;
+ struct config_item *item = to_item(dentry->d_parent);
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ ssize_t retval = 0;
+ /* this initial value is never read: len is reassigned before any use */
+ ssize_t len = min_t(size_t, count, PAGE_SIZE);
+
+ mutex_lock(&buffer->mutex);
+
+ /* we don't support switching read/write modes */
+ if (buffer->write_in_progress) {
+ retval = -ETXTBSY;
+ goto out;
+ }
+ buffer->read_in_progress = 1;
+
+ if (buffer->needs_read_fill) {
+ /* perform first read with buf == NULL to get extent */
+ len = bin_attr->read(item, NULL, 0);
+ if (len <= 0) {
+ retval = len;
+ goto out;
+ }
+
+ /* do not exceed the maximum value */
+ if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) {
+ retval = -EFBIG;
+ goto out;
+ }
+
+ buffer->bin_buffer = vmalloc(len);
+ if (buffer->bin_buffer == NULL) {
+ retval = -ENOMEM;
+ goto out;
+ }
+ buffer->bin_buffer_size = len;
+
+ /*
+  * perform second read to fill buffer
+  * NOTE(review): bin_buffer_size keeps the size from the first call;
+  * if the second call may return fewer bytes, the tail of the
+  * vmalloc'ed buffer is uninitialised - confirm against the
+  * attr->read() contract.
+  */
+ len = bin_attr->read(item, buffer->bin_buffer, len);
+ if (len < 0) {
+ retval = len;
+ vfree(buffer->bin_buffer);
+ buffer->bin_buffer_size = 0;
+ buffer->bin_buffer = NULL;
+ goto out;
+ }
+
+ buffer->needs_read_fill = 0;
+ }
+
+ /* the filled buffer is kept in @buffer and reused by later reads */
+ retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+out:
+ mutex_unlock(&buffer->mutex);
+ return retval;
+}
+
/**
* fill_write_buffer - copy buffer from userspace.
@@ -171,9 +257,8 @@ flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size
{
struct configfs_attribute * attr = to_attr(dentry);
struct config_item * item = to_item(dentry->d_parent);
- struct configfs_item_operations * ops = buffer->ops;
- return ops->store_attribute(item,attr,buffer->page,count);
+ return attr->store(item, buffer->page, count);
}
@@ -210,10 +295,80 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
return len;
}
-static int check_perm(struct inode * inode, struct file * file)
+/**
+ * configfs_write_bin_file - write a binary attribute.
+ * @file: file pointer
+ * @buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Writing to a binary attribute file only buffers the data. We collect
+ * the consecutive writes (binary attribute files do not support lseek)
+ * in a continuously growing vmalloc'ed buffer, but we don't commit
+ * until the close of the file.
+ *
+ * Returns the number of bytes buffered, -ETXTBSY if the file has already
+ * been read through this open, -EFBIG if the write would grow the buffer
+ * beyond a non-zero cb_max_size, -ENOMEM if the buffer cannot be grown, or
+ * the error returned by simple_write_to_buffer().
+ */
+
+static ssize_t
+configfs_write_bin_file(struct file *file, const char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct configfs_buffer *buffer = file->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+	void *tbuf = NULL;
+	ssize_t len;
+
+	mutex_lock(&buffer->mutex);
+
+	/* we don't support switching read/write modes */
+	if (buffer->read_in_progress) {
+		len = -ETXTBSY;
+		goto out;
+	}
+	buffer->write_in_progress = 1;
+
+	/* buffer grows? */
+	if (*ppos + count > buffer->bin_buffer_size) {
+		/* enforce the attribute's size limit before allocating */
+		if (bin_attr->cb_max_size &&
+		    *ppos + count > bin_attr->cb_max_size) {
+			len = -EFBIG;
+			goto out;
+		}
+
+		tbuf = vmalloc(*ppos + count);
+		if (tbuf == NULL) {
+			len = -ENOMEM;
+			goto out;
+		}
+
+		/* copy old contents */
+		if (buffer->bin_buffer) {
+			memcpy(tbuf, buffer->bin_buffer,
+				buffer->bin_buffer_size);
+			vfree(buffer->bin_buffer);
+		}
+
+		/* clear the new area */
+		memset(tbuf + buffer->bin_buffer_size, 0,
+			*ppos + count - buffer->bin_buffer_size);
+		buffer->bin_buffer = tbuf;
+		buffer->bin_buffer_size = *ppos + count;
+	}
+
+	/*
+	 * simple_write_to_buffer() advances *ppos by the number of bytes it
+	 * copied, so the offset must not be advanced again here.
+	 */
+	len = simple_write_to_buffer(buffer->bin_buffer,
+			buffer->bin_buffer_size, ppos, buf, count);
+out:
+	mutex_unlock(&buffer->mutex);
+	return len;
+}
+
+static int check_perm(struct inode * inode, struct file * file, int type)
{
struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
struct configfs_attribute * attr = to_attr(file->f_path.dentry);
+ struct configfs_bin_attribute *bin_attr = NULL;
struct configfs_buffer * buffer;
struct configfs_item_operations * ops = NULL;
int error = 0;
@@ -221,6 +376,9 @@ static int check_perm(struct inode * inode, struct file * file)
if (!item || !attr)
goto Einval;
+ if (type & CONFIGFS_ITEM_BIN_ATTR)
+ bin_attr = to_bin_attr(file->f_path.dentry);
+
/* Grab the module reference for this attribute if we have one */
if (!try_module_get(attr->ca_owner)) {
error = -ENODEV;
@@ -237,10 +395,14 @@ static int check_perm(struct inode * inode, struct file * file)
* and we must have a store method.
*/
if (file->f_mode & FMODE_WRITE) {
+ if (!(inode->i_mode & S_IWUGO))
+ goto Eaccess;
- if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
+ if ((type & CONFIGFS_ITEM_ATTR) && !attr->store)
goto Eaccess;
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write)
+ goto Eaccess;
}
/* File needs read support.
@@ -248,7 +410,13 @@ static int check_perm(struct inode * inode, struct file * file)
* must be a show method for it.
*/
if (file->f_mode & FMODE_READ) {
- if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
+ if (!(inode->i_mode & S_IRUGO))
+ goto Eaccess;
+
+ if ((type & CONFIGFS_ITEM_ATTR) && !attr->show)
+ goto Eaccess;
+
+ if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read)
goto Eaccess;
}
@@ -262,6 +430,8 @@ static int check_perm(struct inode * inode, struct file * file)
}
mutex_init(&buffer->mutex);
buffer->needs_read_fill = 1;
+ buffer->read_in_progress = 0;
+ buffer->write_in_progress = 0;
buffer->ops = ops;
file->private_data = buffer;
goto Done;
@@ -279,12 +449,7 @@ static int check_perm(struct inode * inode, struct file * file)
return error;
}
-static int configfs_open_file(struct inode * inode, struct file * filp)
-{
- return check_perm(inode,filp);
-}
-
-static int configfs_release(struct inode * inode, struct file * filp)
+static int configfs_release(struct inode *inode, struct file *filp)
{
struct config_item * item = to_item(filp->f_path.dentry->d_parent);
struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
@@ -305,6 +470,47 @@ static int configfs_release(struct inode * inode, struct file * filp)
return 0;
}
+static int configfs_open_file(struct inode *inode, struct file *filp)
+{
+ return check_perm(inode, filp, CONFIGFS_ITEM_ATTR);
+}
+
+static int configfs_open_bin_file(struct inode *inode, struct file *filp)
+{
+ return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
+}
+
+static int configfs_release_bin_file(struct inode *inode, struct file *filp)
+{
+ struct configfs_buffer *buffer = filp->private_data;
+ struct dentry *dentry = filp->f_path.dentry;
+ struct config_item *item = to_item(dentry->d_parent);
+ struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+ ssize_t len = 0;
+ int ret;
+
+ buffer->read_in_progress = 0;
+
+ if (buffer->write_in_progress) {
+ buffer->write_in_progress = 0;
+
+ len = bin_attr->write(item, buffer->bin_buffer,
+ buffer->bin_buffer_size);
+
+ /* vfree on NULL is safe */
+ vfree(buffer->bin_buffer);
+ buffer->bin_buffer = NULL;
+ buffer->bin_buffer_size = 0;
+ buffer->needs_read_fill = 1;
+ }
+
+ ret = configfs_release(inode, filp);
+ if (len < 0)
+ return len;
+ return ret;
+}
+
+
const struct file_operations configfs_file_operations = {
.read = configfs_read_file,
.write = configfs_write_file,
@@ -313,6 +519,14 @@ const struct file_operations configfs_file_operations = {
.release = configfs_release,
};
+const struct file_operations configfs_bin_file_operations = {
+ .read = configfs_read_bin_file,
+ .write = configfs_write_bin_file,
+ .llseek = NULL, /* bin file is not seekable */
+ .open = configfs_open_bin_file,
+ .release = configfs_release_bin_file,
+};
+
/**
* configfs_create_file - create an attribute file for an item.
* @item: item we're creating for.
@@ -326,11 +540,32 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
int error = 0;
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL);
+ inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
CONFIGFS_ITEM_ATTR);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
return error;
}
+/**
+ * configfs_create_bin_file - create a binary attribute file for an item.
+ * @item: item we're creating for.
+ * @bin_attr: binary attribute descriptor.
+ */
+
+int configfs_create_bin_file(struct config_item *item,
+ const struct configfs_bin_attribute *bin_attr)
+{
+ struct dentry *dir = item->ci_dentry;
+ struct configfs_dirent *parent_sd = dir->d_fsdata;
+ umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG;
+ int error = 0;
+
+ inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL);
+ error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
+ CONFIGFS_ITEM_BIN_ATTR);
+ inode_unlock(dir->d_inode);
+
+ return error;
+}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index eae87575e681..03d124ae27d7 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -75,7 +75,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
sd_iattr->ia_mode = sd->s_mode;
sd_iattr->ia_uid = GLOBAL_ROOT_UID;
sd_iattr->ia_gid = GLOBAL_ROOT_GID;
- sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
+ sd_iattr->ia_atime = sd_iattr->ia_mtime =
+ sd_iattr->ia_ctime = current_fs_time(inode->i_sb);
sd->s_iattr = sd_iattr;
}
/* attributes were changed atleast once in past */
@@ -111,7 +112,8 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime =
+ inode->i_ctime = current_fs_time(inode->i_sb);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -195,13 +197,21 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in
return -ENOMEM;
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
+ p_inode->i_mtime = p_inode->i_ctime = current_fs_time(p_inode->i_sb);
configfs_set_inode_lock_class(sd, inode);
init(inode);
- d_instantiate(dentry, inode);
- if (S_ISDIR(mode) || S_ISLNK(mode))
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ /*
+ * ->symlink(), ->mkdir(), configfs_register_subsystem() or
+ * create_default_group() - already hashed.
+ */
+ d_instantiate(dentry, inode);
dget(dentry); /* pin link and directory dentries in core */
+ } else {
+ /* ->lookup() */
+ d_add(dentry, inode);
+ }
return error;
}
@@ -218,7 +228,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
return sd->s_dentry->d_name.name;
- if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+ if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) {
attr = sd->s_element;
return attr->ca_name;
}
@@ -255,7 +265,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
/* no inode means this hasn't been made visible yet */
return;
- mutex_lock(&d_inode(dir)->i_mutex);
+ inode_lock(d_inode(dir));
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
if (!sd->s_element)
continue;
@@ -268,5 +278,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
break;
}
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index b863a09cd2f1..8b2a994042dd 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -182,6 +182,7 @@ void config_group_init(struct config_group *group)
{
config_item_init(&group->cg_item);
INIT_LIST_HEAD(&group->cg_children);
+ INIT_LIST_HEAD(&group->default_groups);
}
EXPORT_SYMBOL(config_group_init);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index a8f3b589a2df..cfd91320e869 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -71,8 +71,8 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode;
struct dentry *root;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = CONFIGFS_MAGIC;
sb->s_op = &configfs_ops;
sb->s_time_gran = 1;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index ec5c8325b503..db6d69289608 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,27 +279,33 @@ static int configfs_getlink(struct dentry *dentry, char * path)
}
-static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *configfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unsigned long page = get_zeroed_page(GFP_KERNEL);
+ char *body;
int error;
- if (!page)
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = configfs_getlink(dentry, (char *)page);
+ error = configfs_getlink(dentry, body);
if (!error) {
- return *cookie = (void *)page;
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
- free_page(page);
+ kfree(body);
return ERR_PTR(error);
}
const struct inode_operations configfs_symlink_inode_operations = {
- .follow_link = configfs_follow_link,
+ .get_link = configfs_get_link,
.readlink = generic_readlink,
- .put_link = free_page_put_link,
.setattr = configfs_setattr,
};
diff --git a/fs/coredump.c b/fs/coredump.c
index a8f75640ac86..47c32c3bfa1d 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,10 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
+#include <linux/timekeeping.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -117,6 +121,26 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
ret = cn_vprintf(cn, fmt, arg);
va_end(arg);
+ if (ret == 0) {
+ /*
+ * Ensure that this coredump name component can't cause the
+ * resulting corefile path to consist of a ".." or ".".
+ */
+ if ((cn->used - cur == 1 && cn->corename[cur] == '.') ||
+ (cn->used - cur == 2 && cn->corename[cur] == '.'
+ && cn->corename[cur+1] == '.'))
+ cn->corename[cur] = '!';
+
+ /*
+ * Empty names are fishy and could be used to create a "//" in a
+ * corefile name, causing the coredump to happen one directory
+ * level too high. Enforce that all components of the core
+ * pattern are at least one character long.
+ */
+ if (cn->used == cur)
+ ret = cn_printf(cn, "!");
+ }
+
for (; cur < cn->used; ++cur) {
if (cn->corename[cur] == '/')
cn->corename[cur] = '!';
@@ -232,9 +256,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
break;
/* UNIX time of coredump */
case 't': {
- struct timeval tv;
- do_gettimeofday(&tv);
- err = cn_printf(cn, "%lu", tv.tv_sec);
+ time64_t time;
+
+ time = ktime_get_real_seconds();
+ err = cn_printf(cn, "%lld", time);
break;
}
/* hostname */
@@ -280,23 +305,24 @@ out:
return ispipe;
}
-static int zap_process(struct task_struct *start, int exit_code)
+static int zap_process(struct task_struct *start, int exit_code, int flags)
{
struct task_struct *t;
int nr = 0;
+ /* ignore all signals except SIGKILL, see prepare_signal() */
+ start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
- t = start;
- do {
+ for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
}
- } while_each_thread(start, t);
+ }
return nr;
}
@@ -311,10 +337,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
- nr = zap_process(tsk, exit_code);
tsk->signal->group_exit_task = tsk;
- /* ignore all signals except SIGKILL, see prepare_signal() */
- tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+ nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -360,18 +384,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
continue;
if (g->flags & PF_KTHREAD)
continue;
- p = g;
- do {
- if (p->mm) {
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code);
- p->signal->flags = SIGNAL_GROUP_EXIT;
- unlock_task_sighand(p, &flags);
- }
- break;
+
+ for_each_thread(g, p) {
+ if (unlikely(!p->mm))
+ continue;
+ if (unlikely(p->mm == mm)) {
+ lock_task_sighand(p, &flags);
+ nr += zap_process(p, exit_code,
+ SIGNAL_GROUP_EXIT);
+ unlock_task_sighand(p, &flags);
}
- } while_each_thread(g, p);
+ break;
+ }
}
rcu_read_unlock();
done:
@@ -628,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo)
}
} else {
struct inode *inode;
+ int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
+ O_LARGEFILE | O_EXCL;
if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
@@ -666,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo)
* what matters is that at least one of the two processes
* writes its coredump successfully, not which one.
*/
- cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW |
- O_LARGEFILE | O_EXCL,
- 0600);
+ if (need_suid_safe) {
+ /*
+ * Using user namespaces, normal user tasks can change
+ * their current->fs->root to point to arbitrary
+ * directories. Since the intention of the "only dump
+ * with a fully qualified path" rule is to control where
+ * coredumps may be placed using root privileges,
+ * current->fs->root must not be used. Instead, use the
+ * root directory of init_task.
+ */
+ struct path root;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+ cprm.file = file_open_root(root.dentry, root.mnt,
+ cn.corename, open_flags, 0600);
+ path_put(&root);
+ } else {
+ cprm.file = filp_open(cn.corename, open_flags, 0600);
+ }
if (IS_ERR(cprm.file))
goto fail_unlock;
diff --git a/fs/cramfs/README b/fs/cramfs/README
index 445d1c2d7646..9d4e7ea311f4 100644
--- a/fs/cramfs/README
+++ b/fs/cramfs/README
@@ -86,26 +86,26 @@ Block Size
(Block size in cramfs refers to the size of input data that is
compressed at a time. It's intended to be somewhere around
-PAGE_CACHE_SIZE for cramfs_readpage's convenience.)
+PAGE_SIZE for cramfs_readpage's convenience.)
The superblock ought to indicate the block size that the fs was
written for, since comments in <linux/pagemap.h> indicate that
-PAGE_CACHE_SIZE may grow in future (if I interpret the comment
+PAGE_SIZE may grow in future (if I interpret the comment
correctly).
-Currently, mkcramfs #define's PAGE_CACHE_SIZE as 4096 and uses that
-for blksize, whereas Linux-2.3.39 uses its PAGE_CACHE_SIZE, which in
+Currently, mkcramfs #define's PAGE_SIZE as 4096 and uses that
+for blksize, whereas Linux-2.3.39 uses its PAGE_SIZE, which in
turn is defined as PAGE_SIZE (which can be as large as 32KB on arm).
This discrepancy is a bug, though it's not clear which should be
changed.
-One option is to change mkcramfs to take its PAGE_CACHE_SIZE from
+One option is to change mkcramfs to take its PAGE_SIZE from
<asm/page.h>. Personally I don't like this option, but it does
require the least amount of change: just change `#define
-PAGE_CACHE_SIZE (4096)' to `#include <asm/page.h>'. The disadvantage
+PAGE_SIZE (4096)' to `#include <asm/page.h>'. The disadvantage
is that the generated cramfs cannot always be shared between different
kernels, not even necessarily kernels of the same architecture if
-PAGE_CACHE_SIZE is subject to change between kernel versions
+PAGE_SIZE is subject to change between kernel versions
(currently possible with arm and ia64).
The remaining options try to make cramfs more sharable.
@@ -126,22 +126,22 @@ size. The options are:
1. Always 4096 bytes.
2. Writer chooses blocksize; kernel adapts but rejects blocksize >
- PAGE_CACHE_SIZE.
+ PAGE_SIZE.
3. Writer chooses blocksize; kernel adapts even to blocksize >
- PAGE_CACHE_SIZE.
+ PAGE_SIZE.
It's easy enough to change the kernel to use a smaller value than
-PAGE_CACHE_SIZE: just make cramfs_readpage read multiple blocks.
+PAGE_SIZE: just make cramfs_readpage read multiple blocks.
-The cost of option 1 is that kernels with a larger PAGE_CACHE_SIZE
+The cost of option 1 is that kernels with a larger PAGE_SIZE
value don't get as good compression as they can.
The cost of option 2 relative to option 1 is that the code uses
variables instead of #define'd constants. The gain is that people
-with kernels having larger PAGE_CACHE_SIZE can make use of that if
+with kernels having larger PAGE_SIZE can make use of that if
they don't mind their cramfs being inaccessible to kernels with
-smaller PAGE_CACHE_SIZE values.
+smaller PAGE_SIZE values.
Option 3 is easy to implement if we don't mind being CPU-inefficient:
e.g. get readpage to decompress to a buffer of size MAX_BLKSIZE (which
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 355c522f3585..3a32ddf98095 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -100,6 +100,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
default:
@@ -136,7 +137,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
* page cache and dentry tree anyway..
*
* This also acts as a way to guarantee contiguous areas of up to
- * BLKS_PER_BUF*PAGE_CACHE_SIZE, so that the caller doesn't need to
+ * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to
* worry about end-of-buffer issues even when decompressing a full
* page cache.
*/
@@ -151,7 +152,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
*/
#define BLKS_PER_BUF_SHIFT (2)
#define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT)
-#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_CACHE_SIZE)
+#define BUFFER_SIZE (BLKS_PER_BUF*PAGE_SIZE)
static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE];
static unsigned buffer_blocknr[READ_BUFFERS];
@@ -172,8 +173,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
if (!len)
return NULL;
- blocknr = offset >> PAGE_CACHE_SHIFT;
- offset &= PAGE_CACHE_SIZE - 1;
+ blocknr = offset >> PAGE_SHIFT;
+ offset &= PAGE_SIZE - 1;
/* Check if an existing buffer already has the data.. */
for (i = 0; i < READ_BUFFERS; i++) {
@@ -183,14 +184,14 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
continue;
if (blocknr < buffer_blocknr[i])
continue;
- blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT;
+ blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT;
blk_offset += offset;
if (blk_offset + len > BUFFER_SIZE)
continue;
return read_buffers[i] + blk_offset;
}
- devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT;
+ devsize = mapping->host->i_size >> PAGE_SHIFT;
/* Ok, read in BLKS_PER_BUF pages completely first. */
for (i = 0; i < BLKS_PER_BUF; i++) {
@@ -212,7 +213,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
wait_on_page_locked(page);
if (!PageUptodate(page)) {
/* asynchronous error */
- page_cache_release(page);
+ put_page(page);
pages[i] = NULL;
}
}
@@ -228,12 +229,12 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
struct page *page = pages[i];
if (page) {
- memcpy(data, kmap(page), PAGE_CACHE_SIZE);
+ memcpy(data, kmap(page), PAGE_SIZE);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
} else
- memset(data, 0, PAGE_CACHE_SIZE);
- data += PAGE_CACHE_SIZE;
+ memset(data, 0, PAGE_SIZE);
+ data += PAGE_SIZE;
}
return read_buffers[buffer] + offset;
}
@@ -352,7 +353,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = CRAMFS_MAGIC;
- buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_bsize = PAGE_SIZE;
buf->f_blocks = CRAMFS_SB(sb)->blocks;
buf->f_bfree = 0;
buf->f_bavail = 0;
@@ -495,7 +496,7 @@ static int cramfs_readpage(struct file *file, struct page *page)
int bytes_filled;
void *pgdata;
- maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
bytes_filled = 0;
pgdata = kmap(page);
@@ -515,14 +516,14 @@ static int cramfs_readpage(struct file *file, struct page *page)
if (compr_len == 0)
; /* hole */
- else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
+ else if (unlikely(compr_len > (PAGE_SIZE << 1))) {
pr_err("bad compressed blocksize %u\n",
compr_len);
goto err;
} else {
mutex_lock(&read_mutex);
bytes_filled = cramfs_uncompress_block(pgdata,
- PAGE_CACHE_SIZE,
+ PAGE_SIZE,
cramfs_read(sb, start_offset, compr_len),
compr_len);
mutex_unlock(&read_mutex);
@@ -531,7 +532,7 @@ static int cramfs_readpage(struct file *file, struct page *page)
}
}
- memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
+ memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
flush_dcache_page(page);
kunmap(page);
SetPageUptodate(page);
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
new file mode 100644
index 000000000000..92348faf9865
--- /dev/null
+++ b/fs/crypto/Kconfig
@@ -0,0 +1,18 @@
+config FS_ENCRYPTION
+ tristate "FS Encryption (Per-file encryption)"
+ depends on BLOCK
+ select CRYPTO
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_ECB
+ select CRYPTO_XTS
+ select CRYPTO_CTS
+ select CRYPTO_CTR
+ select CRYPTO_SHA256
+ select KEYS
+ select ENCRYPTED_KEYS
+ help
+ Enable encryption of files and directories. This
+ feature is similar to ecryptfs, but it is more memory
+ efficient since it avoids caching the encrypted and
+ decrypted pages in the page cache.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
new file mode 100644
index 000000000000..f17684c48739
--- /dev/null
+++ b/fs/crypto/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o
+
+fscrypto-y := crypto.o fname.o policy.o keyinfo.o
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
new file mode 100644
index 000000000000..2fc8c43ce531
--- /dev/null
+++ b/fs/crypto/crypto.c
@@ -0,0 +1,568 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ * Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ * Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ * Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/fscrypto.h>
+#include <linux/ecryptfs.h>
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+ "Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+ "Number of crypto contexts to preallocate");
+
+static mempool_t *fscrypt_bounce_page_pool = NULL;
+
+static LIST_HEAD(fscrypt_free_ctxs);
+static DEFINE_SPINLOCK(fscrypt_ctx_lock);
+
+static struct workqueue_struct *fscrypt_read_workqueue;
+static DEFINE_MUTEX(fscrypt_init_mutex);
+
+static struct kmem_cache *fscrypt_ctx_cachep;
+struct kmem_cache *fscrypt_info_cachep;
+
+/**
+ * fscrypt_release_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+ unsigned long flags;
+
+ if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+ mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
+ ctx->w.bounce_page = NULL;
+ }
+ ctx->w.control_page = NULL;
+ if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+ kmem_cache_free(fscrypt_ctx_cachep, ctx);
+ } else {
+ spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+ list_add(&ctx->free_list, &fscrypt_free_ctxs);
+ spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+ }
+}
+EXPORT_SYMBOL(fscrypt_release_ctx);
+
+/**
+ * fscrypt_get_ctx() - Gets an encryption context
+ * @inode: The inode for which we are doing the crypto
+ * @gfp_flags: The gfp flag for memory allocation
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; an
+ * ERR_PTR()-encoded error (-ENOKEY or -ENOMEM) otherwise, never NULL.
+ */
+struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags)
+{
+ struct fscrypt_ctx *ctx = NULL;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ unsigned long flags;
+
+ if (ci == NULL)
+ return ERR_PTR(-ENOKEY);
+
+ /*
+ * We first try getting the ctx from a free list because in
+ * the common case the ctx will have an allocated and
+ * initialized crypto tfm, so it's probably a worthwhile
+ * optimization. For the bounce page, we first try getting it
+ * from the kernel allocator because that's just about as fast
+ * as getting it from a list and because a cache of free pages
+ * should generally be a "last resort" option for a filesystem
+ * to be able to do its job.
+ */
+ spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+ ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
+ struct fscrypt_ctx, free_list);
+ if (ctx)
+ list_del(&ctx->free_list);
+ spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+ if (!ctx) {
+ ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ } else {
+ ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+ }
+ ctx->flags &= ~FS_WRITE_PATH_FL;
+ return ctx;
+}
+EXPORT_SYMBOL(fscrypt_get_ctx);
+
+/**
+ * fscrypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void fscrypt_complete(struct crypto_async_request *req, int res)
+{
+ struct fscrypt_completion_result *ecr = req->data;
+
+ if (res == -EINPROGRESS)
+ return;
+ ecr->res = res;
+ complete(&ecr->completion);
+}
+
+typedef enum {
+ FS_DECRYPT = 0,
+ FS_ENCRYPT,
+} fscrypt_direction_t;
+
+static int do_page_crypto(struct inode *inode,
+ fscrypt_direction_t rw, pgoff_t index,
+ struct page *src_page, struct page *dest_page,
+ gfp_t gfp_flags)
+{
+ u8 xts_tweak[FS_XTS_TWEAK_SIZE];
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct scatterlist dst, src;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
+ int res = 0;
+
+ req = skcipher_request_alloc(tfm, gfp_flags);
+ if (!req) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_request_alloc() failed\n",
+ __func__);
+ return -ENOMEM;
+ }
+
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ fscrypt_complete, &ecr);
+
+ BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index));
+ memcpy(xts_tweak, &index, sizeof(index));
+ memset(&xts_tweak[sizeof(index)], 0,
+ FS_XTS_TWEAK_SIZE - sizeof(index));
+
+ sg_init_table(&dst, 1);
+ sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
+ sg_init_table(&src, 1);
+ sg_set_page(&src, src_page, PAGE_SIZE, 0);
+ skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE,
+ xts_tweak);
+ if (rw == FS_DECRYPT)
+ res = crypto_skcipher_decrypt(req);
+ else
+ res = crypto_skcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ BUG_ON(req->base.data != &ecr);
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+ skcipher_request_free(req);
+ if (res) {
+ printk_ratelimited(KERN_ERR
+ "%s: crypto_skcipher_encrypt() returned %d\n",
+ __func__, res);
+ return res;
+ }
+ return 0;
+}
+
+static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
+{
+ ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
+ if (ctx->w.bounce_page == NULL)
+ return ERR_PTR(-ENOMEM);
+ ctx->flags |= FS_WRITE_PATH_FL;
+ return ctx->w.bounce_page;
+}
+
+/**
+ * fscrypt_encrypt_page() - Encrypts a page
+ * @inode: The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ * @gfp_flags: The gfp flag for memory allocation
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path. The caller must call
+ * fscrypt_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * ERR_PTR()-encoded error value (never NULL).
+ */
+struct page *fscrypt_encrypt_page(struct inode *inode,
+ struct page *plaintext_page, gfp_t gfp_flags)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ int err;
+
+ BUG_ON(!PageLocked(plaintext_page));
+
+ ctx = fscrypt_get_ctx(inode, gfp_flags);
+ if (IS_ERR(ctx))
+ return (struct page *)ctx;
+
+ /* The encryption operation will require a bounce page. */
+ ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
+ if (IS_ERR(ciphertext_page))
+ goto errout;
+
+ ctx->w.control_page = plaintext_page;
+ err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
+ plaintext_page, ciphertext_page,
+ gfp_flags);
+ if (err) {
+ ciphertext_page = ERR_PTR(err);
+ goto errout;
+ }
+ SetPagePrivate(ciphertext_page);
+ set_page_private(ciphertext_page, (unsigned long)ctx);
+ lock_page(ciphertext_page);
+ return ciphertext_page;
+
+errout:
+ fscrypt_release_ctx(ctx);
+ return ciphertext_page;
+}
+EXPORT_SYMBOL(fscrypt_encrypt_page);
+
+/**
+ * fscrypt_decrypt_page() - Decrypts a page in-place
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the crypt info of page->mapping->host.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_decrypt_page(struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+
+ return do_page_crypto(page->mapping->host,
+ FS_DECRYPT, page->index, page, page, GFP_NOFS);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_page);
+
+int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
+ sector_t pblk, unsigned int len)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *ciphertext_page = NULL;
+ struct bio *bio;
+ int ret, err = 0;
+
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
+
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT);
+ if (IS_ERR(ciphertext_page)) {
+ err = PTR_ERR(ciphertext_page);
+ goto errout;
+ }
+
+ while (len--) {
+ err = do_page_crypto(inode, FS_ENCRYPT, lblk,
+ ZERO_PAGE(0), ciphertext_page,
+ GFP_NOFS);
+ if (err)
+ goto errout;
+
+ bio = bio_alloc(GFP_NOWAIT, 1);
+ if (!bio) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ bio->bi_bdev = inode->i_sb->s_bdev;
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ ret = bio_add_page(bio, ciphertext_page,
+ inode->i_sb->s_blocksize, 0);
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ WARN_ON(1);
+ bio_put(bio);
+ err = -EIO;
+ goto errout;
+ }
+ err = submit_bio_wait(WRITE, bio);
+ if ((err == 0) && bio->bi_error)
+ err = -EIO;
+ bio_put(bio);
+ if (err)
+ goto errout;
+ lblk++;
+ pblk++;
+ }
+ err = 0;
+errout:
+ fscrypt_release_ctx(ctx);
+ return err;
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *dir;
+ struct fscrypt_info *ci;
+ int dir_has_key, cached_with_key;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ dir = dget_parent(dentry);
+ if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) {
+ dput(dir);
+ return 0;
+ }
+
+ ci = d_inode(dir)->i_crypt_info;
+ if (ci && ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD))))
+ ci = NULL;
+
+ /* this should eventually be a flag in d_flags */
+ spin_lock(&dentry->d_lock);
+ cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
+ spin_unlock(&dentry->d_lock);
+ dir_has_key = (ci != NULL);
+ dput(dir);
+
+ /*
+ * If the dentry was cached without the key, and it is a
+ * negative dentry, it might be a valid name. We can't check
+ * if the key has since been made available due to locking
+ * reasons, so we fail the validation so ext4_lookup() can do
+ * this check.
+ *
+ * We also fail the validation if the dentry was created with
+ * the key present, but we no longer have the key, or vice versa.
+ */
+ if ((!cached_with_key && d_is_negative(dentry)) ||
+ (!cached_with_key && dir_has_key) ||
+ (cached_with_key && !dir_has_key))
+ return 0;
+ return 1;
+}
+
+const struct dentry_operations fscrypt_d_ops = {
+ .d_revalidate = fscrypt_d_revalidate,
+};
+EXPORT_SYMBOL(fscrypt_d_ops);
+
+/*
+ * Call fscrypt_decrypt_page on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+ struct fscrypt_ctx *ctx =
+ container_of(work, struct fscrypt_ctx, r.work);
+ struct bio *bio = ctx->r.bio;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+ int ret = fscrypt_decrypt_page(page);
+
+ if (ret) {
+ WARN_ON_ONCE(1);
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ }
+ fscrypt_release_ctx(ctx);
+ bio_put(bio);
+}
+
+void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
+{
+ INIT_WORK(&ctx->r.work, completion_pages);
+ ctx->r.bio = bio;
+ queue_work(fscrypt_read_workqueue, &ctx->r.work);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
+
+void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+ struct fscrypt_ctx *ctx;
+ struct page *bounce_page;
+
+ /* Bounce pages have no mapping; a mapped page needs no pullback. */
+ if ((*page)->mapping)
+ return;
+
+ /* The bounce data page is unmapped. */
+ bounce_page = *page;
+ ctx = (struct fscrypt_ctx *)page_private(bounce_page);
+
+ /* restore control page */
+ *page = ctx->w.control_page;
+
+ if (restore)
+ fscrypt_restore_control_page(bounce_page);
+}
+EXPORT_SYMBOL(fscrypt_pullback_bio_page);
+
+void fscrypt_restore_control_page(struct page *page)
+{
+ struct fscrypt_ctx *ctx;
+
+ ctx = (struct fscrypt_ctx *)page_private(page);
+ set_page_private(page, (unsigned long)NULL);
+ ClearPagePrivate(page);
+ unlock_page(page);
+ fscrypt_release_ctx(ctx);
+}
+EXPORT_SYMBOL(fscrypt_restore_control_page);
+
+static void fscrypt_destroy(void)
+{
+ struct fscrypt_ctx *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list)
+ kmem_cache_free(fscrypt_ctx_cachep, pos);
+ INIT_LIST_HEAD(&fscrypt_free_ctxs);
+ mempool_destroy(fscrypt_bounce_page_pool);
+ fscrypt_bounce_page_pool = NULL;
+}
+
+/**
+ * fscrypt_initialize() - allocate major buffers for fs encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_initialize(void)
+{
+ int i, res = -ENOMEM;
+
+ if (fscrypt_bounce_page_pool)
+ return 0;
+
+ mutex_lock(&fscrypt_init_mutex);
+ if (fscrypt_bounce_page_pool)
+ goto already_initialized;
+
+ for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+ struct fscrypt_ctx *ctx;
+
+ ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
+ if (!ctx)
+ goto fail;
+ list_add(&ctx->free_list, &fscrypt_free_ctxs);
+ }
+
+ fscrypt_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+ if (!fscrypt_bounce_page_pool)
+ goto fail;
+
+already_initialized:
+ mutex_unlock(&fscrypt_init_mutex);
+ return 0;
+fail:
+ fscrypt_destroy();
+ mutex_unlock(&fscrypt_init_mutex);
+ return res;
+}
+EXPORT_SYMBOL(fscrypt_initialize);
+
+/**
+ * fscrypt_init() - Set up for fs encryption.
+ */
+static int __init fscrypt_init(void)
+{
+ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
+ WQ_HIGHPRI, 0);
+ if (!fscrypt_read_workqueue)
+ goto fail;
+
+ fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_ctx_cachep)
+ goto fail_free_queue;
+
+ fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
+ if (!fscrypt_info_cachep)
+ goto fail_free_ctx;
+
+ return 0;
+
+fail_free_ctx:
+ kmem_cache_destroy(fscrypt_ctx_cachep);
+fail_free_queue:
+ destroy_workqueue(fscrypt_read_workqueue);
+fail:
+ return -ENOMEM;
+}
+module_init(fscrypt_init)
+
+/**
+ * fscrypt_exit() - Shutdown the fs encryption system
+ */
+static void __exit fscrypt_exit(void)
+{
+ fscrypt_destroy();
+
+ if (fscrypt_read_workqueue)
+ destroy_workqueue(fscrypt_read_workqueue);
+ kmem_cache_destroy(fscrypt_ctx_cachep);
+ kmem_cache_destroy(fscrypt_info_cachep);
+}
+module_exit(fscrypt_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c
index ab377d496a39..5d6d49113efa 100644
--- a/fs/f2fs/crypto_fname.c
+++ b/fs/crypto/fname.c
@@ -1,46 +1,32 @@
/*
- * linux/fs/f2fs/crypto_fname.c
- *
- * Copied from linux/fs/ext4/crypto.c
+ * This contains functions for filename crypto management
*
* Copyright (C) 2015, Google, Inc.
* Copyright (C) 2015, Motorola Mobility
*
- * This contains functions for filename crypto management in f2fs
- *
* Written by Uday Savagaonkar, 2014.
- *
- * Adjust f2fs dentry structure
- * Jaegeuk Kim, 2015.
+ * Modified by Jaegeuk Kim, 2015.
*
* This has not yet undergone a rigorous security audit.
*/
-#include <crypto/hash.h>
-#include <crypto/sha.h>
+
#include <keys/encrypted-type.h>
#include <keys/user-type.h>
-#include <linux/crypto.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/random.h>
#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-#include <linux/f2fs_fs.h>
#include <linux/ratelimit.h>
+#include <linux/fscrypto.h>
-#include "f2fs.h"
-#include "f2fs_crypto.h"
-#include "xattr.h"
+static u32 size_round_up(size_t size, size_t blksize)
+{
+ return ((size + blksize - 1) / blksize) * blksize;
+}
/**
- * f2fs_dir_crypt_complete() -
+ * dir_crypt_complete() - completion callback for filename crypto requests
*/
-static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
+static void dir_crypt_complete(struct crypto_async_request *req, int res)
{
- struct f2fs_completion_result *ecr = req->data;
+ struct fscrypt_completion_result *ecr = req->data;
if (res == -EINPROGRESS)
return;
@@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
complete(&ecr->completion);
}
-bool f2fs_valid_filenames_enc_mode(uint32_t mode)
-{
- return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS);
-}
-
-static unsigned max_name_len(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
- F2FS_NAME_LEN;
-}
-
/**
- * f2fs_fname_encrypt() -
+ * fname_encrypt() -
*
* This function encrypts the input filename, and returns the length of the
* ciphertext. Errors are returned as negative numbers. We trust the caller to
* allocate sufficient memory to oname string.
*/
-static int f2fs_fname_encrypt(struct inode *inode,
- const struct qstr *iname, struct f2fs_str *oname)
+static int fname_encrypt(struct inode *inode,
+ const struct qstr *iname, struct fscrypt_str *oname)
{
u32 ciphertext_len;
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- char iv[F2FS_CRYPTO_BLOCK_SIZE];
+ char iv[FS_CRYPTO_BLOCK_SIZE];
struct scatterlist src_sg, dst_sg;
- int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
+ int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
char *workbuf, buf[32], *alloc_buf = NULL;
- unsigned lim = max_name_len(inode);
+ unsigned lim;
+ lim = inode->i_sb->s_cop->max_namelen(inode);
if (iname->len <= 0 || iname->len > lim)
return -EIO;
- ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ?
- F2FS_CRYPTO_BLOCK_SIZE : iname->len;
- ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding);
+ ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ?
+ FS_CRYPTO_BLOCK_SIZE : iname->len;
+ ciphertext_len = size_round_up(ciphertext_len, padding);
ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
if (ciphertext_len <= sizeof(buf)) {
@@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode,
}
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(KERN_ERR
"%s: crypto_request_alloc() failed\n", __func__);
kfree(alloc_buf);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- f2fs_dir_crypt_complete, &ecr);
+ dir_crypt_complete, &ecr);
/* Copy the input */
memcpy(workbuf, iname->name, iname->len);
@@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode,
memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
/* Initialize IV */
- memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
/* Create encryption request */
sg_init_one(&src_sg, workbuf, ciphertext_len);
sg_init_one(&dst_sg, oname->name, ciphertext_len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
- res = crypto_ablkcipher_encrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
kfree(alloc_buf);
- ablkcipher_request_free(req);
- if (res < 0) {
+ skcipher_request_free(req);
+ if (res < 0)
printk_ratelimited(KERN_ERR
"%s: Error (error code %d)\n", __func__, res);
- }
+
oname->len = ciphertext_len;
return res;
}
/*
- * f2fs_fname_decrypt()
+ * fname_decrypt()
* This function decrypts the input filename, and returns
* the length of the plaintext.
* Errors are returned as negative numbers.
* We trust the caller to allocate sufficient memory to oname string.
*/
-static int f2fs_fname_decrypt(struct inode *inode,
- const struct f2fs_str *iname, struct f2fs_str *oname)
+static int fname_decrypt(struct inode *inode,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
{
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
struct scatterlist src_sg, dst_sg;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- char iv[F2FS_CRYPTO_BLOCK_SIZE];
- unsigned lim = max_name_len(inode);
+ char iv[FS_CRYPTO_BLOCK_SIZE];
+ unsigned lim;
+ lim = inode->i_sb->s_cop->max_namelen(inode);
if (iname->len <= 0 || iname->len > lim)
return -EIO;
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(KERN_ERR
"%s: crypto_request_alloc() failed\n", __func__);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- f2fs_dir_crypt_complete, &ecr);
+ dir_crypt_complete, &ecr);
/* Initialize IV */
- memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
+ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
/* Create decryption request */
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
- res = crypto_ablkcipher_decrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_skcipher_decrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(KERN_ERR
- "%s: Error in f2fs_fname_decrypt (error code %d)\n",
- __func__, res);
+ "%s: Error (error code %d)\n", __func__, res);
return res;
}
@@ -200,7 +175,7 @@ static const char *lookup_table =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
/**
- * f2fs_fname_encode_digest() -
+ * digest_encode() -
*
* Encodes the input digest using characters from the set [a-zA-Z0-9_+].
* The encoded string is roughly 4/3 times the size of the input string.
@@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst)
return cp - dst;
}
-/**
- * f2fs_fname_crypto_round_up() -
- *
- * Return: The next multiple of block size
- */
-u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize)
+u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
{
- return ((size + blksize - 1) / blksize) * blksize;
+ int padding = 32;
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ if (ci)
+ padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
+ if (ilen < FS_CRYPTO_BLOCK_SIZE)
+ ilen = FS_CRYPTO_BLOCK_SIZE;
+ return size_round_up(ilen, padding);
}
+EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
/**
- * f2fs_fname_crypto_alloc_obuff() -
+ * fscrypt_fname_alloc_buffer() -
*
* Allocates an output buffer that is sufficient for the crypto operation
* specified by the context and the direction.
*/
-int f2fs_fname_crypto_alloc_buffer(struct inode *inode,
- u32 ilen, struct f2fs_str *crypto_str)
+int fscrypt_fname_alloc_buffer(struct inode *inode,
+ u32 ilen, struct fscrypt_str *crypto_str)
{
- unsigned int olen;
- int padding = 16;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
+ unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
- if (ci)
- padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
- if (padding < F2FS_CRYPTO_BLOCK_SIZE)
- padding = F2FS_CRYPTO_BLOCK_SIZE;
- olen = f2fs_fname_crypto_round_up(ilen, padding);
crypto_str->len = olen;
- if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
- olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
- /* Allocated buffer can hold one more character to null-terminate the
- * string */
+ if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
+ olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
+ /*
+ * Allocated buffer can hold one more character to null-terminate the
+ * string
+ */
crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
if (!(crypto_str->name))
return -ENOMEM;
return 0;
}
+EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
/**
- * f2fs_fname_crypto_free_buffer() -
+ * fscrypt_fname_free_buffer() -
*
* Frees the buffer allocated for crypto operation.
*/
-void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str)
+void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
if (!crypto_str)
return;
kfree(crypto_str->name);
crypto_str->name = NULL;
}
+EXPORT_SYMBOL(fscrypt_fname_free_buffer);
/**
- * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space
+ * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
+ * space
*/
-int f2fs_fname_disk_to_usr(struct inode *inode,
- f2fs_hash_t *hash,
- const struct f2fs_str *iname,
- struct f2fs_str *oname)
+int fscrypt_fname_disk_to_usr(struct inode *inode,
+ u32 hash, u32 minor_hash,
+ const struct fscrypt_str *iname,
+ struct fscrypt_str *oname)
{
const struct qstr qname = FSTR_TO_QSTR(iname);
char buf[24];
int ret;
- if (is_dot_dotdot(&qname)) {
+ if (fscrypt_is_dot_dotdot(&qname)) {
oname->name[0] = '.';
oname->name[iname->len - 1] = '.';
oname->len = iname->len;
return oname->len;
}
- if (F2FS_I(inode)->i_crypt_info)
- return f2fs_fname_decrypt(inode, iname, oname);
+ if (iname->len < FS_CRYPTO_BLOCK_SIZE)
+ return -EUCLEAN;
- if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) {
+ if (inode->i_crypt_info)
+ return fname_decrypt(inode, iname, oname);
+
+ if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
ret = digest_encode(iname->name, iname->len, oname->name);
oname->len = ret;
return ret;
}
if (hash) {
- memcpy(buf, hash, 4);
- memset(buf + 4, 0, 4);
- } else
+ memcpy(buf, &hash, 4);
+ memcpy(buf + 4, &minor_hash, 4);
+ } else {
memset(buf, 0, 8);
+ }
memcpy(buf + 8, iname->name + iname->len - 16, 16);
oname->name[0] = '_';
ret = digest_encode(buf, 24, oname->name + 1);
oname->len = ret + 1;
return ret + 1;
}
+EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
/**
- * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space
+ * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
+ * space
*/
-int f2fs_fname_usr_to_disk(struct inode *inode,
+int fscrypt_fname_usr_to_disk(struct inode *inode,
const struct qstr *iname,
- struct f2fs_str *oname)
+ struct fscrypt_str *oname)
{
- int res;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
- if (is_dot_dotdot(iname)) {
+ if (fscrypt_is_dot_dotdot(iname)) {
oname->name[0] = '.';
oname->name[iname->len - 1] = '.';
oname->len = iname->len;
return oname->len;
}
-
- if (ci) {
- res = f2fs_fname_encrypt(inode, iname, oname);
- return res;
- }
- /* Without a proper key, a user is not allowed to modify the filenames
+ if (inode->i_crypt_info)
+ return fname_encrypt(inode, iname, oname);
+ /*
+ * Without a proper key, a user is not allowed to modify the filenames
* in a directory. Consequently, a user space name cannot be mapped to
- * a disk-space name */
+ * a disk-space name
+ */
return -EACCES;
}
+EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
-int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
- int lookup, struct f2fs_filename *fname)
+int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
+ int lookup, struct fscrypt_name *fname)
{
- struct f2fs_crypt_info *ci;
int ret = 0, bigname = 0;
- memset(fname, 0, sizeof(struct f2fs_filename));
+ memset(fname, 0, sizeof(struct fscrypt_name));
fname->usr_fname = iname;
- if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) {
+ if (!dir->i_sb->s_cop->is_encrypted(dir) ||
+ fscrypt_is_dot_dotdot(iname)) {
fname->disk_name.name = (unsigned char *)iname->name;
fname->disk_name.len = iname->len;
return 0;
}
- ret = f2fs_get_encryption_info(dir);
- if (ret)
+ ret = get_crypt_info(dir);
+ if (ret && ret != -EOPNOTSUPP)
return ret;
- ci = F2FS_I(dir)->i_crypt_info;
- if (ci) {
- ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len,
- &fname->crypto_buf);
+
+ if (dir->i_crypt_info) {
+ ret = fscrypt_fname_alloc_buffer(dir, iname->len,
+ &fname->crypto_buf);
if (ret < 0)
return ret;
- ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf);
+ ret = fname_encrypt(dir, iname, &fname->crypto_buf);
if (ret < 0)
goto errout;
fname->disk_name.name = fname->crypto_buf.name;
@@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
if (!lookup)
return -EACCES;
- /* We don't have the key and we are doing a lookup; decode the
+ /*
+ * We don't have the key and we are doing a lookup; decode the
* user-supplied name
*/
if (iname->name[0] == '_')
bigname = 1;
- if ((bigname && (iname->len != 33)) ||
- (!bigname && (iname->len > 43)))
+ if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
return -ENOENT;
fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
if (fname->crypto_buf.name == NULL)
return -ENOMEM;
+
ret = digest_decode(iname->name + bigname, iname->len - bigname,
fname->crypto_buf.name);
if (ret < 0) {
@@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
fname->crypto_buf.len = ret;
if (bigname) {
memcpy(&fname->hash, fname->crypto_buf.name, 4);
+ memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
} else {
fname->disk_name.name = fname->crypto_buf.name;
fname->disk_name.len = fname->crypto_buf.len;
}
return 0;
+
errout:
- f2fs_fname_crypto_free_buffer(&fname->crypto_buf);
+ fscrypt_fname_free_buffer(&fname->crypto_buf);
return ret;
}
+EXPORT_SYMBOL(fscrypt_setup_filename);
-void f2fs_fname_free_filename(struct f2fs_filename *fname)
+void fscrypt_free_filename(struct fscrypt_name *fname)
{
kfree(fname->crypto_buf.name);
fname->crypto_buf.name = NULL;
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
}
+EXPORT_SYMBOL(fscrypt_free_filename);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
new file mode 100644
index 000000000000..06f5aa478bf2
--- /dev/null
+++ b/fs/crypto/keyinfo.c
@@ -0,0 +1,272 @@
+/*
+ * key management facility for FS encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+#include <linux/fscrypto.h>
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+ struct fscrypt_completion_result *ecr = req->data;
+
+ if (rc == -EINPROGRESS)
+ return;
+
+ ecr->res = rc;
+ complete(&ecr->completion);
+}
+
+/**
+ * derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivation.
+ * @source_key: Source key to which to apply derivation.
+ * @derived_key: Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
+ u8 source_key[FS_AES_256_XTS_KEY_SIZE],
+ u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
+{
+ int res = 0;
+ struct skcipher_request *req = NULL;
+ DECLARE_FS_COMPLETION_RESULT(ecr);
+ struct scatterlist src_sg, dst_sg;
+ struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
+
+ if (IS_ERR(tfm)) {
+ res = PTR_ERR(tfm);
+ tfm = NULL;
+ goto out;
+ }
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
+ if (!req) {
+ res = -ENOMEM;
+ goto out;
+ }
+ skcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ derive_crypt_complete, &ecr);
+ res = crypto_skcipher_setkey(tfm, deriving_key,
+ FS_AES_128_ECB_KEY_SIZE);
+ if (res < 0)
+ goto out;
+
+ sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
+ sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ FS_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_skcipher_encrypt(req);
+ if (res == -EINPROGRESS || res == -EBUSY) {
+ wait_for_completion(&ecr.completion);
+ res = ecr.res;
+ }
+out:
+ skcipher_request_free(req);
+ crypto_free_skcipher(tfm);
+ return res;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+ if (!ci)
+ return;
+
+ key_put(ci->ci_keyring_key);
+ crypto_free_skcipher(ci->ci_ctfm);
+ kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
+int get_crypt_info(struct inode *inode)
+{
+ struct fscrypt_info *crypt_info;
+ u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
+ (FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
+ struct key *keyring_key = NULL;
+ struct fscrypt_key *master_key;
+ struct fscrypt_context ctx;
+ const struct user_key_payload *ukp;
+ struct crypto_skcipher *ctfm;
+ const char *cipher_str;
+ u8 raw_key[FS_MAX_KEY_SIZE];
+ u8 mode;
+ int res;
+
+ res = fscrypt_initialize();
+ if (res)
+ return res;
+
+ if (!inode->i_sb->s_cop->get_context)
+ return -EOPNOTSUPP;
+retry:
+ crypt_info = ACCESS_ONCE(inode->i_crypt_info);
+ if (crypt_info) {
+ if (!crypt_info->ci_keyring_key ||
+ key_validate(crypt_info->ci_keyring_key) == 0)
+ return 0;
+ fscrypt_put_encryption_info(inode, crypt_info);
+ goto retry;
+ }
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ if (!fscrypt_dummy_context_enabled(inode))
+ return res;
+ ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ } else if (res != sizeof(ctx)) {
+ return -EINVAL;
+ }
+ res = 0;
+
+ crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
+ if (!crypt_info)
+ return -ENOMEM;
+
+ crypt_info->ci_flags = ctx.flags;
+ crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+ crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+ crypt_info->ci_ctfm = NULL;
+ crypt_info->ci_keyring_key = NULL;
+ memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+ sizeof(crypt_info->ci_master_key));
+ if (S_ISREG(inode->i_mode))
+ mode = crypt_info->ci_data_mode;
+ else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ mode = crypt_info->ci_filename_mode;
+ else
+ BUG();
+
+ switch (mode) {
+ case FS_ENCRYPTION_MODE_AES_256_XTS:
+ cipher_str = "xts(aes)";
+ break;
+ case FS_ENCRYPTION_MODE_AES_256_CTS:
+ cipher_str = "cts(cbc(aes))";
+ break;
+ default:
+ printk_once(KERN_WARNING
+ "%s: unsupported key mode %d (ino %u)\n",
+ __func__, mode, (unsigned) inode->i_ino);
+ res = -ENOKEY;
+ goto out;
+ }
+ if (fscrypt_dummy_context_enabled(inode)) {
+ memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
+ goto got_key;
+ }
+ memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
+ FS_KEY_DESC_PREFIX_SIZE);
+ sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
+ "%*phN", FS_KEY_DESCRIPTOR_SIZE,
+ ctx.master_key_descriptor);
+ full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
+ (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
+ keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+ if (IS_ERR(keyring_key)) {
+ res = PTR_ERR(keyring_key);
+ keyring_key = NULL;
+ goto out;
+ }
+ crypt_info->ci_keyring_key = keyring_key;
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "%s: key type must be logon\n", __func__);
+ res = -ENOKEY;
+ goto out;
+ }
+ down_read(&keyring_key->sem);
+ ukp = user_key_payload(keyring_key);
+ if (ukp->datalen != sizeof(struct fscrypt_key)) {
+ res = -EINVAL;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ master_key = (struct fscrypt_key *)ukp->data;
+ BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
+
+ if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "%s: key size incorrect: %d\n",
+ __func__, master_key->size);
+ res = -ENOKEY;
+ up_read(&keyring_key->sem);
+ goto out;
+ }
+ res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
+ up_read(&keyring_key->sem);
+ if (res)
+ goto out;
+got_key:
+ ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
+ if (!ctfm || IS_ERR(ctfm)) {
+ res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+ printk(KERN_DEBUG
+ "%s: error %d (inode %u) allocating crypto tfm\n",
+ __func__, res, (unsigned) inode->i_ino);
+ goto out;
+ }
+ crypt_info->ci_ctfm = ctfm;
+ crypto_skcipher_clear_flags(ctfm, ~0);
+ crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode));
+ if (res)
+ goto out;
+
+ memzero_explicit(raw_key, sizeof(raw_key));
+ if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
+ put_crypt_info(crypt_info);
+ goto retry;
+ }
+ return 0;
+
+out:
+ if (res == -ENOKEY)
+ res = 0;
+ put_crypt_info(crypt_info);
+ memzero_explicit(raw_key, sizeof(raw_key));
+ return res;
+}
+
+void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
+{
+ struct fscrypt_info *prev;
+
+ if (ci == NULL)
+ ci = ACCESS_ONCE(inode->i_crypt_info);
+ if (ci == NULL)
+ return;
+
+ prev = cmpxchg(&inode->i_crypt_info, ci, NULL);
+ if (prev != ci)
+ return;
+
+ put_crypt_info(ci);
+}
+EXPORT_SYMBOL(fscrypt_put_encryption_info);
+
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ struct fscrypt_info *ci = inode->i_crypt_info;
+
+ if (!ci ||
+ (ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD)))))
+ return get_crypt_info(inode);
+ return 0;
+}
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
new file mode 100644
index 000000000000..0f9961eede1e
--- /dev/null
+++ b/fs/crypto/policy.c
@@ -0,0 +1,229 @@
+/*
+ * Encryption policy functions for per-file encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/fscrypto.h>
+
+static int inode_has_encryption_context(struct inode *inode)
+{
+ if (!inode->i_sb->s_cop->get_context)
+ return 0;
+ return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int is_encryption_context_consistent_with_policy(struct inode *inode,
+ const struct fscrypt_policy *policy)
+{
+ struct fscrypt_context ctx;
+ int res;
+
+ if (!inode->i_sb->s_cop->get_context)
+ return 0;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res != sizeof(ctx))
+ return 0;
+
+ return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (ctx.flags == policy->flags) &&
+ (ctx.contents_encryption_mode ==
+ policy->contents_encryption_mode) &&
+ (ctx.filenames_encryption_mode ==
+ policy->filenames_encryption_mode));
+}
+
+static int create_encryption_context_from_policy(struct inode *inode,
+ const struct fscrypt_policy *policy)
+{
+ struct fscrypt_context ctx;
+ int res;
+
+ if (!inode->i_sb->s_cop->set_context)
+ return -EOPNOTSUPP;
+
+ if (inode->i_sb->s_cop->prepare_context) {
+ res = inode->i_sb->s_cop->prepare_context(inode);
+ if (res)
+ return res;
+ }
+
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+
+ if (!fscrypt_valid_contents_enc_mode(
+ policy->contents_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid contents encryption mode %d\n", __func__,
+ policy->contents_encryption_mode);
+ return -EINVAL;
+ }
+
+ if (!fscrypt_valid_filenames_enc_mode(
+ policy->filenames_encryption_mode)) {
+ printk(KERN_WARNING
+ "%s: Invalid filenames encryption mode %d\n", __func__,
+ policy->filenames_encryption_mode);
+ return -EINVAL;
+ }
+
+ if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+ return -EINVAL;
+
+ ctx.contents_encryption_mode = policy->contents_encryption_mode;
+ ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+ ctx.flags = policy->flags;
+ BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
+ get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+
+ return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
+}
+
+int fscrypt_process_policy(struct inode *inode,
+ const struct fscrypt_policy *policy)
+{
+ if (policy->version != 0)
+ return -EINVAL;
+
+ if (!inode_has_encryption_context(inode)) {
+ if (!inode->i_sb->s_cop->empty_dir)
+ return -EOPNOTSUPP;
+ if (!inode->i_sb->s_cop->empty_dir(inode))
+ return -ENOTEMPTY;
+ return create_encryption_context_from_policy(inode, policy);
+ }
+
+ if (is_encryption_context_consistent_with_policy(inode, policy))
+ return 0;
+
+ printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+ __func__);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(fscrypt_process_policy);
+
+int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+{
+ struct fscrypt_context ctx;
+ int res;
+
+ if (!inode->i_sb->s_cop->get_context ||
+ !inode->i_sb->s_cop->is_encrypted(inode))
+ return -ENODATA;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res != sizeof(ctx))
+ return -ENODATA;
+ if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+ return -EINVAL;
+
+ policy->version = 0;
+ policy->contents_encryption_mode = ctx.contents_encryption_mode;
+ policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+ policy->flags = ctx.flags;
+ memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+ FS_KEY_DESCRIPTOR_SIZE);
+ return 0;
+}
+EXPORT_SYMBOL(fscrypt_get_policy);
+
+int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
+{
+ struct fscrypt_info *parent_ci, *child_ci;
+ int res;
+
+ if ((parent == NULL) || (child == NULL)) {
+ printk(KERN_ERR "parent %p child %p\n", parent, child);
+ BUG_ON(1);
+ }
+
+ /* no restrictions if the parent directory is not encrypted */
+ if (!parent->i_sb->s_cop->is_encrypted(parent))
+ return 1;
+ /* if the child directory is not encrypted, this is always a problem */
+ if (!parent->i_sb->s_cop->is_encrypted(child))
+ return 0;
+ res = fscrypt_get_encryption_info(parent);
+ if (res)
+ return 0;
+ res = fscrypt_get_encryption_info(child);
+ if (res)
+ return 0;
+ parent_ci = parent->i_crypt_info;
+ child_ci = child->i_crypt_info;
+ if (!parent_ci && !child_ci)
+ return 1;
+ if (!parent_ci || !child_ci)
+ return 0;
+
+ return (memcmp(parent_ci->ci_master_key,
+ child_ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+ (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+ (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
+ (parent_ci->ci_flags == child_ci->ci_flags));
+}
+EXPORT_SYMBOL(fscrypt_has_permitted_context);
+
+/**
+ * fscrypt_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child: Child inode that inherits the context from @parent.
+ * @fs_data: private data given by FS.
+ * @preload: preload child i_crypt_info
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int fscrypt_inherit_context(struct inode *parent, struct inode *child,
+ void *fs_data, bool preload)
+{
+ struct fscrypt_context ctx;
+ struct fscrypt_info *ci;
+ int res;
+
+ if (!parent->i_sb->s_cop->set_context)
+ return -EOPNOTSUPP;
+
+ res = fscrypt_get_encryption_info(parent);
+ if (res < 0)
+ return res;
+
+ ci = parent->i_crypt_info;
+ if (ci == NULL)
+ return -ENOKEY;
+
+ ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+ if (fscrypt_dummy_context_enabled(parent)) {
+ ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
+ ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
+ ctx.flags = 0;
+ memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
+ res = 0;
+ } else {
+ ctx.contents_encryption_mode = ci->ci_data_mode;
+ ctx.filenames_encryption_mode = ci->ci_filename_mode;
+ ctx.flags = ci->ci_flags;
+ memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+ FS_KEY_DESCRIPTOR_SIZE);
+ }
+ get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+ res = parent->i_sb->s_cop->set_context(child, &ctx,
+ sizeof(ctx), fs_data);
+ if (res)
+ return res;
+ return preload ? fscrypt_get_encryption_info(child): 0;
+}
+EXPORT_SYMBOL(fscrypt_inherit_context);
diff --git a/fs/dax.c b/fs/dax.c
index a86d3cc2b389..75ba46d82a76 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,52 +24,91 @@
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
-int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
- struct block_device *bdev = inode->i_sb->s_bdev;
- sector_t sector = block << (inode->i_blkbits - 9);
+ struct request_queue *q = bdev->bd_queue;
+ long rc = -EIO;
+
+ dax->addr = (void __pmem *) ERR_PTR(-EIO);
+ if (blk_queue_enter(q, true) != 0)
+ return rc;
+
+ rc = bdev_direct_access(bdev, dax);
+ if (rc < 0) {
+ dax->addr = (void __pmem *) ERR_PTR(rc);
+ blk_queue_exit(q);
+ return rc;
+ }
+ return rc;
+}
+
+static void dax_unmap_atomic(struct block_device *bdev,
+ const struct blk_dax_ctl *dax)
+{
+ if (IS_ERR(dax->addr))
+ return;
+ blk_queue_exit(bdev->bd_queue);
+}
+
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+ struct page *page = alloc_pages(GFP_KERNEL, 0);
+ struct blk_dax_ctl dax = {
+ .size = PAGE_SIZE,
+ .sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+ };
+ long rc;
+
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ rc = dax_map_atomic(bdev, &dax);
+ if (rc < 0)
+ return ERR_PTR(rc);
+ memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+ dax_unmap_atomic(bdev, &dax);
+ return page;
+}
+
+/*
+ * dax_clear_sectors() is called from within a transaction context in XFS,
+ * so every operation on the stack from this point down must follow
+ * GFP_NOFS semantics.
+ */
+int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
+{
+ struct blk_dax_ctl dax = {
+ .sector = _sector,
+ .size = _size,
+ };
might_sleep();
do {
- void __pmem *addr;
- unsigned long pfn;
- long count;
+ long count, sz;
- count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+ count = dax_map_atomic(bdev, &dax);
if (count < 0)
return count;
- BUG_ON(size < count);
- while (count > 0) {
- unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
- if (pgsz > count)
- pgsz = count;
- clear_pmem(addr, pgsz);
- addr += pgsz;
- size -= pgsz;
- count -= pgsz;
- BUG_ON(pgsz & 511);
- sector += pgsz / 512;
- cond_resched();
- }
- } while (size);
+ sz = min_t(long, count, SZ_128K);
+ clear_pmem(dax.addr, sz);
+ dax.size -= sz;
+ dax.sector += sz / 512;
+ dax_unmap_atomic(bdev, &dax);
+ cond_resched();
+ } while (dax.size);
wmb_pmem();
return 0;
}
-EXPORT_SYMBOL_GPL(dax_clear_blocks);
-
-static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
- unsigned blkbits)
-{
- unsigned long pfn;
- sector_t sector = bh->b_blocknr << (blkbits - 9);
- return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
-}
+EXPORT_SYMBOL_GPL(dax_clear_sectors);
/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
@@ -100,19 +139,29 @@ static bool buffer_size_valid(struct buffer_head *bh)
return bh->b_state != 0;
}
+
+/*
+ * Convert the block number recorded in @bh into a 512-byte sector number,
+ * using the block size shift of @inode.
+ */
+static sector_t to_sector(const struct buffer_head *bh,
+		const struct inode *inode)
+{
+	return bh->b_blocknr << (inode->i_blkbits - 9);
+}
+
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
loff_t start, loff_t end, get_block_t get_block,
struct buffer_head *bh)
{
- ssize_t retval = 0;
- loff_t pos = start;
- loff_t max = start;
- loff_t bh_max = start;
- void __pmem *addr;
- bool hole = false;
- bool need_wmb = false;
-
- if (iov_iter_rw(iter) != WRITE)
+ loff_t pos = start, max = start, bh_max = start;
+ bool hole = false, need_wmb = false;
+ struct block_device *bdev = NULL;
+ int rw = iov_iter_rw(iter), rc;
+ long map_len = 0;
+ struct blk_dax_ctl dax = {
+ .addr = (void __pmem *) ERR_PTR(-EIO),
+ };
+
+ if (rw == READ)
end = min(end, i_size_read(inode));
while (pos < end) {
@@ -127,13 +176,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
if (pos == bh_max) {
bh->b_size = PAGE_ALIGN(end - pos);
bh->b_state = 0;
- retval = get_block(inode, block, bh,
- iov_iter_rw(iter) == WRITE);
- if (retval)
+ rc = get_block(inode, block, bh, rw == WRITE);
+ if (rc)
break;
if (!buffer_size_valid(bh))
bh->b_size = 1 << blkbits;
bh_max = pos - first + bh->b_size;
+ bdev = bh->b_bdev;
} else {
unsigned done = bh->b_size -
(bh_max - (pos - first));
@@ -141,45 +190,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
bh->b_size -= done;
}
- hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
+ hole = rw == READ && !buffer_written(bh);
if (hole) {
- addr = NULL;
size = bh->b_size - first;
} else {
- retval = dax_get_addr(bh, &addr, blkbits);
- if (retval < 0)
+ dax_unmap_atomic(bdev, &dax);
+ dax.sector = to_sector(bh, inode);
+ dax.size = bh->b_size;
+ map_len = dax_map_atomic(bdev, &dax);
+ if (map_len < 0) {
+ rc = map_len;
break;
+ }
if (buffer_unwritten(bh) || buffer_new(bh)) {
- dax_new_buf(addr, retval, first, pos,
- end);
+ dax_new_buf(dax.addr, map_len, first,
+ pos, end);
need_wmb = true;
}
- addr += first;
- size = retval - first;
+ dax.addr += first;
+ size = map_len - first;
}
max = min(pos + size, end);
}
if (iov_iter_rw(iter) == WRITE) {
- len = copy_from_iter_pmem(addr, max - pos, iter);
+ len = copy_from_iter_pmem(dax.addr, max - pos, iter);
need_wmb = true;
} else if (!hole)
- len = copy_to_iter((void __force *)addr, max - pos,
+ len = copy_to_iter((void __force *) dax.addr, max - pos,
iter);
else
len = iov_iter_zero(max - pos, iter);
- if (!len)
+ if (!len) {
+ rc = -EFAULT;
break;
+ }
pos += len;
- addr += len;
+ if (!IS_ERR(dax.addr))
+ dax.addr += len;
}
if (need_wmb)
wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
- return (pos == start) ? retval : pos - start;
+ return (pos == start) ? rc : pos - start;
}
/**
@@ -208,13 +265,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
loff_t end = pos + iov_iter_count(iter);
memset(&bh, 0, sizeof(bh));
+ bh.b_bdev = inode->i_sb->s_bdev;
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
struct address_space *mapping = inode->i_mapping;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = filemap_write_and_wait_range(mapping, pos, end - 1);
if (retval) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
goto out;
}
}
@@ -226,10 +284,15 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
retval = dax_io(inode, iter, pos, end, get_block, &bh);
if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
+
+ if (end_io) {
+ int err;
- if ((retval > 0) && end_io)
- end_io(iocb, pos, retval, bh.b_private);
+ err = end_io(iocb, pos, retval, bh.b_private);
+ if (err)
+ retval = err;
+ }
if (!(flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(inode);
@@ -260,7 +323,7 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return VM_FAULT_SIGBUS;
}
@@ -268,28 +331,231 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
return VM_FAULT_LOCKED;
}
-static int copy_user_bh(struct page *to, struct buffer_head *bh,
- unsigned blkbits, unsigned long vaddr)
+static int copy_user_bh(struct page *to, struct inode *inode,
+ struct buffer_head *bh, unsigned long vaddr)
{
- void __pmem *vfrom;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
+ struct block_device *bdev = bh->b_bdev;
void *vto;
- if (dax_get_addr(bh, &vfrom, blkbits) < 0)
- return -EIO;
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
vto = kmap_atomic(to);
- copy_user_page(vto, (void __force *)vfrom, vaddr, to);
+ copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
kunmap_atomic(vto);
+ dax_unmap_atomic(bdev, &dax);
return 0;
}
+/* Passed as the sector to dax_radix_entry() when no new entry is to be inserted. */
+#define NO_SECTOR (-1)
+/*
+ * Mask @page_index with PMD_MASK expressed in pages.  The argument is
+ * parenthesised so that expressions using lower-precedence operators than
+ * '&' are masked as a whole.
+ */
+#define DAX_PMD_INDEX(page_index) ((page_index) & (PMD_MASK >> PAGE_SHIFT))
+/*
+ * Insert a DAX entry for @index into @mapping's radix tree, or find the entry
+ * that already covers it, and tag that entry dirty when @dirty is set.
+ *
+ * @sector is encoded into a newly inserted entry.  Passing NO_SECTOR means
+ * nothing is inserted; an existing entry is still dirtied.
+ * @pmd_entry requests a PMD-type entry; an existing PTE-type entry at @index
+ * is deleted before the PMD entry is inserted.
+ *
+ * All tree accesses happen under mapping->tree_lock with interrupts disabled.
+ *
+ * Returns 0 on success, -EIO if an entry of an unexpected type is found, or
+ * the error returned by radix_tree_insert().
+ */
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+ sector_t sector, bool pmd_entry, bool dirty)
+{
+ struct radix_tree_root *page_tree = &mapping->page_tree;
+ pgoff_t pmd_index = DAX_PMD_INDEX(index);
+ int type, error = 0;
+ void *entry;
+
+ WARN_ON_ONCE(pmd_entry && !dirty);
+ if (dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ spin_lock_irq(&mapping->tree_lock);
+ /* an existing PMD entry at pmd_index already covers @index: dirty it */
+ entry = radix_tree_lookup(page_tree, pmd_index);
+ if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+ index = pmd_index;
+ goto dirty;
+ }
+
+ entry = radix_tree_lookup(page_tree, index);
+ if (entry) {
+ type = RADIX_DAX_TYPE(entry);
+ if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+ type != RADIX_DAX_PMD)) {
+ error = -EIO;
+ goto unlock;
+ }
+ /* nothing to replace unless a PTE entry is being upgraded to PMD */
+ if (!pmd_entry || type == RADIX_DAX_PMD)
+ goto dirty;
+
+ /*
+ * We only insert dirty PMD entries into the radix tree. This
+ * means we don't need to worry about removing a dirty PTE
+ * entry and inserting a clean PMD entry, thus reducing the
+ * range we would flush with a follow-up fsync/msync call.
+ */
+ radix_tree_delete(&mapping->page_tree, index);
+ mapping->nrexceptional--;
+ }
+
+ if (sector == NO_SECTOR) {
+ /*
+ * This can happen during correct operation if our pfn_mkwrite
+ * fault raced against a hole punch operation. If this
+ * happens the pte that was hole punched will have been
+ * unmapped and the radix tree entry will have been removed by
+ * the time we are called, but the call will still happen. We
+ * will return all the way up to wp_pfn_shared(), where the
+ * pte_same() check will fail, eventually causing page fault
+ * to be retried by the CPU.
+ */
+ goto unlock;
+ }
+
+ error = radix_tree_insert(page_tree, index,
+ RADIX_DAX_ENTRY(sector, pmd_entry));
+ if (error)
+ goto unlock;
+ /* keep the exceptional-entry count in step with the insertion */
+ mapping->nrexceptional++;
+ dirty:
+ if (dirty)
+ radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+ spin_unlock_irq(&mapping->tree_lock);
+ return error;
+}
+/*
+ * Write back the CPU cache for the single DAX radix tree entry @entry found
+ * at @index, then clear its PAGECACHE_TAG_TOWRITE tag.
+ *
+ * The entry is re-validated under tree_lock first: if the slot no longer
+ * holds @entry, or is no longer tagged TOWRITE, 0 is returned without
+ * flushing anything.
+ *
+ * Returns a negative errno on failure.  On the flush path the value returned
+ * is the non-negative result of dax_map_atomic() rather than a plain 0; the
+ * caller in this file only tests for ret < 0.
+ */
+static int dax_writeback_one(struct block_device *bdev,
+ struct address_space *mapping, pgoff_t index, void *entry)
+{
+ struct radix_tree_root *page_tree = &mapping->page_tree;
+ int type = RADIX_DAX_TYPE(entry);
+ struct radix_tree_node *node;
+ struct blk_dax_ctl dax;
+ void **slot;
+ int ret = 0;
+
+ spin_lock_irq(&mapping->tree_lock);
+ /*
+ * Regular page slots are stabilized by the page lock even
+ * without the tree itself locked. These unlocked entries
+ * need verification under the tree lock.
+ */
+ if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+ goto unlock;
+ if (*slot != entry)
+ goto unlock;
+
+ /* another fsync thread may have already written back this entry */
+ if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+ goto unlock;
+
+ if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+ ret = -EIO;
+ goto unlock;
+ }
+
+ dax.sector = RADIX_DAX_SECTOR(entry);
+ dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+ spin_unlock_irq(&mapping->tree_lock);
+
+ /*
+ * We cannot hold tree_lock while calling dax_map_atomic() because it
+ * eventually calls cond_resched().
+ */
+ ret = dax_map_atomic(bdev, &dax);
+ if (ret < 0)
+ return ret;
+
+ if (WARN_ON_ONCE(ret < dax.size)) {
+ ret = -EIO;
+ goto unmap;
+ }
+
+ wb_cache_pmem(dax.addr, dax.size);
+ /* tree_lock is retaken only for the tag update */
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+ spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+ dax_unmap_atomic(bdev, &dax);
+ return ret;
+
+ unlock:
+ spin_unlock_irq(&mapping->tree_lock);
+ return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ *
+ * Only WB_SYNC_ALL writeback is handled; other sync modes, and mappings with
+ * no exceptional entries, return 0 without doing any work.
+ *
+ * Returns 0 on success, -EIO if the inode block size is not PAGE_SIZE, or the
+ * first negative value returned by dax_writeback_one().
+ */
+int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct pagevec pvec;
+	bool done = false;
+	int i, ret = 0;
+	void *entry;
+
+	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+		return -EIO;
+
+	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+		return 0;
+
+	start_index = wbc->range_start >> PAGE_SHIFT;
+	end_index = wbc->range_end >> PAGE_SHIFT;
+	pmd_index = DAX_PMD_INDEX(start_index);
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+	rcu_read_unlock();
+
+	/* see if the start of our range is covered by a PMD entry */
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+		start_index = pmd_index;
+
+	tag_pages_for_writeback(mapping, start_index, end_index);
+
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		pvec.nr = find_get_entries_tag(mapping, start_index,
+				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+				pvec.pages, indices);
+
+		if (pvec.nr == 0)
+			break;
+
+		for (i = 0; i < pvec.nr; i++) {
+			if (indices[i] > end_index) {
+				done = true;
+				break;
+			}
+
+			ret = dax_writeback_one(bdev, mapping, indices[i],
+					pvec.pages[i]);
+			if (ret < 0)
+				return ret;
+		}
+
+		/*
+		 * Resume the scan after the last entry of this batch.  Without
+		 * this every pass restarts at the same index and forward
+		 * progress relies solely on each entry losing its TOWRITE tag.
+		 */
+		start_index = indices[pvec.nr - 1] + 1;
+	}
+	wmb_pmem();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
- sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- void __pmem *addr;
- unsigned long pfn;
+ struct address_space *mapping = inode->i_mapping;
+ struct block_device *bdev = bh->b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(bh, inode),
+ .size = bh->b_size,
+ };
pgoff_t size;
int error;
@@ -308,20 +574,23 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
goto out;
}
- error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
- if (error < 0)
- goto out;
- if (error < PAGE_SIZE) {
- error = -EIO;
+ if (dax_map_atomic(bdev, &dax) < 0) {
+ error = PTR_ERR(dax.addr);
goto out;
}
if (buffer_unwritten(bh) || buffer_new(bh)) {
- clear_pmem(addr, PAGE_SIZE);
+ clear_pmem(dax.addr, PAGE_SIZE);
wmb_pmem();
}
+ dax_unmap_atomic(bdev, &dax);
- error = vm_insert_mixed(vma, vaddr, pfn);
+ error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+ vmf->flags & FAULT_FLAG_WRITE);
+ if (error)
+ goto out;
+
+ error = vm_insert_mixed(vma, vaddr, dax.pfn);
out:
i_mmap_unlock_read(mapping);
@@ -366,18 +635,19 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
memset(&bh, 0, sizeof(bh));
block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+ bh.b_bdev = inode->i_sb->s_bdev;
bh.b_size = PAGE_SIZE;
repeat:
page = find_get_page(mapping, vmf->pgoff);
if (page) {
if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
- page_cache_release(page);
+ put_page(page);
return VM_FAULT_RETRY;
}
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -415,7 +685,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (vmf->cow_page) {
struct page *new_page = vmf->cow_page;
if (buffer_written(&bh))
- error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+ error = copy_user_bh(new_page, inode, &bh, vaddr);
else
clear_user_highpage(new_page, vaddr);
if (error)
@@ -441,10 +711,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (page) {
unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ PAGE_SIZE, 0);
delete_from_page_cache(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
+ page = NULL;
}
/*
@@ -476,7 +747,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
unlock_page:
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
goto out;
}
@@ -516,6 +787,24 @@ EXPORT_SYMBOL_GPL(dax_fault);
*/
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+/*
+ * Emit a pr_debug() line explaining why a fault fell back.  When @bh is
+ * non-NULL the device name, buffer state, start block and length are logged
+ * as well; @fn is printed as the message prefix.
+ */
+static void __dax_dbg(struct buffer_head *bh, unsigned long address,
+		const char *reason, const char *fn)
+{
+	char bname[BDEVNAME_SIZE];
+
+	if (!bh) {
+		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
+			current->comm, address, reason);
+		return;
+	}
+
+	bdevname(bh->b_bdev, bname);
+	pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
+		"length %zd fallback: %s\n", fn, current->comm,
+		address, bname, bh->b_state, (u64)bh->b_blocknr,
+		bh->b_size, reason);
+}
+
+#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
+
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, unsigned int flags, get_block_t get_block,
dax_iodone_t complete_unwritten)
@@ -527,57 +816,83 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
unsigned blkbits = inode->i_blkbits;
unsigned long pmd_addr = address & PMD_MASK;
bool write = flags & FAULT_FLAG_WRITE;
- long length;
- void __pmem *kaddr;
+ struct block_device *bdev;
pgoff_t size, pgoff;
- sector_t block, sector;
- unsigned long pfn;
- int result = 0;
+ sector_t block;
+ int error, result = 0;
+ bool alloc = false;
+
+ /* dax pmd mappings require pfn_t_devmap() */
+ if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
+ return VM_FAULT_FALLBACK;
/* Fall back to PTEs if we're going to COW */
- if (write && !(vma->vm_flags & VM_SHARED))
+ if (write && !(vma->vm_flags & VM_SHARED)) {
+ split_huge_pmd(vma, pmd, address);
+ dax_pmd_dbg(NULL, address, "cow write");
return VM_FAULT_FALLBACK;
+ }
/* If the PMD would extend outside the VMA */
- if (pmd_addr < vma->vm_start)
+ if (pmd_addr < vma->vm_start) {
+ dax_pmd_dbg(NULL, address, "vma start unaligned");
return VM_FAULT_FALLBACK;
- if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ }
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
+ dax_pmd_dbg(NULL, address, "vma end unaligned");
return VM_FAULT_FALLBACK;
+ }
pgoff = linear_page_index(vma, pmd_addr);
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (pgoff >= size)
return VM_FAULT_SIGBUS;
/* If the PMD would cover blocks out of the file */
- if ((pgoff | PG_PMD_COLOUR) >= size)
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(NULL, address,
+ "offset + huge page size > file size");
return VM_FAULT_FALLBACK;
+ }
memset(&bh, 0, sizeof(bh));
+ bh.b_bdev = inode->i_sb->s_bdev;
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
bh.b_size = PMD_SIZE;
- length = get_block(inode, block, &bh, write);
- if (length)
+
+ if (get_block(inode, block, &bh, 0) != 0)
return VM_FAULT_SIGBUS;
- i_mmap_lock_read(mapping);
+
+ if (!buffer_mapped(&bh) && write) {
+ if (get_block(inode, block, &bh, 1) != 0)
+ return VM_FAULT_SIGBUS;
+ alloc = true;
+ }
+
+ bdev = bh.b_bdev;
/*
* If the filesystem isn't willing to tell us the length of a hole,
* just fall back to PTEs. Calling get_block 512 times in a loop
* would be silly.
*/
- if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
- goto fallback;
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "allocated block too small");
+ return VM_FAULT_FALLBACK;
+ }
/*
* If we allocated new storage, make sure no process has any
* zero pages covering this hole
*/
- if (buffer_new(&bh)) {
- i_mmap_unlock_read(mapping);
- unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
- i_mmap_lock_read(mapping);
+ if (alloc) {
+ loff_t lstart = pgoff << PAGE_SHIFT;
+ loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+ truncate_pagecache_range(inode, lstart, lend);
}
+ i_mmap_lock_read(mapping);
+
/*
* If a truncate happened while we were allocating blocks, we may
* leave blocks allocated to the file that are beyond EOF. We can't
@@ -589,50 +904,108 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
result = VM_FAULT_SIGBUS;
goto out;
}
- if ((pgoff | PG_PMD_COLOUR) >= size)
+ if ((pgoff | PG_PMD_COLOUR) >= size) {
+ dax_pmd_dbg(&bh, address,
+ "offset + huge page size > file size");
goto fallback;
+ }
if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
spinlock_t *ptl;
pmd_t entry;
struct page *zero_page = get_huge_zero_page();
- if (unlikely(!zero_page))
+ if (unlikely(!zero_page)) {
+ dax_pmd_dbg(&bh, address, "no zero page");
goto fallback;
+ }
ptl = pmd_lock(vma->vm_mm, pmd);
if (!pmd_none(*pmd)) {
spin_unlock(ptl);
+ dax_pmd_dbg(&bh, address, "pmd already present");
goto fallback;
}
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: <zero> sect: %llx\n",
+ __func__, current->comm, address,
+ (unsigned long long) to_sector(&bh, inode));
+
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
result = VM_FAULT_NOPAGE;
spin_unlock(ptl);
} else {
- sector = bh.b_blocknr << (blkbits - 9);
- length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
- bh.b_size);
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PMD_SIZE,
+ };
+ long length = dax_map_atomic(bdev, &dax);
+
if (length < 0) {
result = VM_FAULT_SIGBUS;
goto out;
}
- if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ if (length < PMD_SIZE) {
+ dax_pmd_dbg(&bh, address, "dax-length too small");
+ dax_unmap_atomic(bdev, &dax);
+ goto fallback;
+ }
+ if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
+ dax_pmd_dbg(&bh, address, "pfn unaligned");
+ dax_unmap_atomic(bdev, &dax);
goto fallback;
+ }
+
+ if (!pfn_t_devmap(dax.pfn)) {
+ dax_unmap_atomic(bdev, &dax);
+ dax_pmd_dbg(&bh, address, "pfn not in memmap");
+ goto fallback;
+ }
if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- int i;
- for (i = 0; i < PTRS_PER_PMD; i++)
- clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+ clear_pmem(dax.addr, PMD_SIZE);
wmb_pmem();
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
result |= VM_FAULT_MAJOR;
}
+ dax_unmap_atomic(bdev, &dax);
+
+ /*
+ * For PTE faults we insert a radix tree entry for reads, and
+ * leave it clean. Then on the first write we dirty the radix
+ * tree entry via the dax_pfn_mkwrite() path. This sequence
+ * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+ * call into get_block() to translate the pgoff to a sector in
+ * order to be able to create a new radix tree entry.
+ *
+ * The PMD path doesn't have an equivalent to
+ * dax_pfn_mkwrite(), though, so for a read followed by a
+ * write we traverse all the way through __dax_pmd_fault()
+ * twice. This means we can just skip inserting a radix tree
+ * entry completely on the initial read and just wait until
+ * the write to insert a dirty entry.
+ */
+ if (write) {
+ error = dax_radix_entry(mapping, pgoff, dax.sector,
+ true, true);
+ if (error) {
+ dax_pmd_dbg(&bh, address,
+ "PMD radix insertion failed");
+ goto fallback;
+ }
+ }
- result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ dev_dbg(part_to_dev(bdev->bd_part),
+ "%s: %s addr: %lx pfn: %lx sect: %llx\n",
+ __func__, current->comm, address,
+ pfn_t_to_pfn(dax.pfn),
+ (unsigned long long) dax.sector);
+ result |= vmf_insert_pfn_pmd(vma, address, pmd,
+ dax.pfn, write);
}
out:
@@ -684,15 +1057,27 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
* @vmf: The description of the fault
- *
*/
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ struct file *file = vma->vm_file;
+ int error;
- sb_start_pagefault(sb);
- file_update_time(vma->vm_file);
- sb_end_pagefault(sb);
+ /*
+ * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+ * RADIX_DAX_PTE entry already exists in the radix tree from a
+ * previous call to __dax_fault(). We just want to look up that PTE
+ * entry using vmf->pgoff and make sure the dirty tag is set. This
+ * saves us from having to make a call to get_block() here to look
+ * up the sector.
+ */
+ error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+ true);
+
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (error)
+ return VM_FAULT_SIGBUS;
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -709,7 +1094,7 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
* you are truncating a file, the helper function dax_truncate_page() may be
* more convenient.
*
- * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * We work in terms of PAGE_SIZE here for commonality with
* block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
* took care of disposing of the unnecessary blocks. Even if the filesystem
* block size is smaller than PAGE_SIZE, we have to zero the rest of the page
@@ -719,27 +1104,33 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
get_block_t get_block)
{
struct buffer_head bh;
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ pgoff_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
int err;
/* Block boundary? Nothing to do */
if (!length)
return 0;
- BUG_ON((offset + length) > PAGE_CACHE_SIZE);
+ BUG_ON((offset + length) > PAGE_SIZE);
memset(&bh, 0, sizeof(bh));
- bh.b_size = PAGE_CACHE_SIZE;
+ bh.b_bdev = inode->i_sb->s_bdev;
+ bh.b_size = PAGE_SIZE;
err = get_block(inode, index, &bh, 0);
if (err < 0)
return err;
if (buffer_written(&bh)) {
- void __pmem *addr;
- err = dax_get_addr(&bh, &addr, inode->i_blkbits);
- if (err < 0)
- return err;
- clear_pmem(addr + offset, length);
+ struct block_device *bdev = bh.b_bdev;
+ struct blk_dax_ctl dax = {
+ .sector = to_sector(&bh, inode),
+ .size = PAGE_SIZE,
+ };
+
+ if (dax_map_atomic(bdev, &dax) < 0)
+ return PTR_ERR(dax.addr);
+ clear_pmem(dax.addr + offset, length);
wmb_pmem();
+ dax_unmap_atomic(bdev, &dax);
}
return 0;
@@ -755,7 +1146,7 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
* Similar to block_truncate_page(), this function can be called by a
* filesystem when it is truncating a DAX file to handle the partial page.
*
- * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * We work in terms of PAGE_SIZE here for commonality with
* block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
* took care of disposing of the unnecessary blocks. Even if the filesystem
* block size is smaller than PAGE_SIZE, we have to zero the rest of the page
@@ -763,7 +1154,7 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
*/
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
- unsigned length = PAGE_CACHE_ALIGN(from) - from;
+ unsigned length = PAGE_ALIGN(from) - from;
return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c33aeb0f68f..d5ecc6e477da 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry)
return dentry->d_name.name != dentry->d_iname;
}
-/*
- * Make sure other CPUs see the inode attached before the type is set.
- */
static inline void __d_set_inode_and_type(struct dentry *dentry,
struct inode *inode,
unsigned type_flags)
@@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
unsigned flags;
dentry->d_inode = inode;
- smp_wmb();
flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
flags |= type_flags;
WRITE_ONCE(dentry->d_flags, flags);
}
-/*
- * Ideally, we want to make sure that other CPUs see the flags cleared before
- * the inode is detached, but this is really a violation of RCU principles
- * since the ordering suggests we should always set inode before flags.
- *
- * We should instead replace or discard the entire dentry - but that sucks
- * performancewise on mass deletion/rename.
- */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
WRITE_ONCE(dentry->d_flags, flags);
- smp_wmb();
dentry->d_inode = NULL;
}
@@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -1571,7 +1560,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
if (name->len > DNAME_INLINE_LEN-1) {
size_t size = offsetof(struct external_name, name[1]);
- struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+ struct external_name *p = kmalloc(size + name->len,
+ GFP_KERNEL_ACCOUNT);
if (!p) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
@@ -1677,7 +1667,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
DCACHE_OP_REVALIDATE |
DCACHE_OP_WEAK_REVALIDATE |
DCACHE_OP_DELETE |
- DCACHE_OP_SELECT_INODE));
+ DCACHE_OP_SELECT_INODE |
+ DCACHE_OP_REAL));
dentry->d_op = op;
if (!op)
return;
@@ -1695,6 +1686,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_PRUNE;
if (op->d_select_inode)
dentry->d_flags |= DCACHE_OP_SELECT_INODE;
+ if (op->d_real)
+ dentry->d_flags |= DCACHE_OP_REAL;
}
EXPORT_SYMBOL(d_set_d_op);
@@ -1734,7 +1727,7 @@ static unsigned d_flags_for_inode(struct inode *inode)
}
if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
- if (unlikely(inode->i_op->follow_link)) {
+ if (unlikely(inode->i_op->get_link)) {
add_flags = DCACHE_SYMLINK_TYPE;
goto type_determined;
}
@@ -1755,12 +1748,12 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
unsigned add_flags = d_flags_for_inode(inode);
spin_lock(&dentry->d_lock);
- if (inode)
- hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
- dentry_rcuwalk_invalidate(dentry);
+ raw_write_seqcount_end(&dentry->d_seq);
+ __fsnotify_d_instantiate(dentry);
spin_unlock(&dentry->d_lock);
- fsnotify_d_instantiate(dentry, inode);
}
/**
@@ -1781,91 +1774,16 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
void d_instantiate(struct dentry *entry, struct inode * inode)
{
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
- if (inode)
+ if (inode) {
spin_lock(&inode->i_lock);
- __d_instantiate(entry, inode);
- if (inode)
+ __d_instantiate(entry, inode);
spin_unlock(&inode->i_lock);
+ }
security_d_instantiate(entry, inode);
}
EXPORT_SYMBOL(d_instantiate);
/**
- * d_instantiate_unique - instantiate a non-aliased dentry
- * @entry: dentry to instantiate
- * @inode: inode to attach to this dentry
- *
- * Fill in inode information in the entry. On success, it returns NULL.
- * If an unhashed alias of "entry" already exists, then we return the
- * aliased dentry instead and drop one reference to inode.
- *
- * Note that in order to avoid conflicts with rename() etc, the caller
- * had better be holding the parent directory semaphore.
- *
- * This also assumes that the inode count has been incremented
- * (or otherwise set) by the caller to indicate that it is now
- * in use by the dcache.
- */
-static struct dentry *__d_instantiate_unique(struct dentry *entry,
- struct inode *inode)
-{
- struct dentry *alias;
- int len = entry->d_name.len;
- const char *name = entry->d_name.name;
- unsigned int hash = entry->d_name.hash;
-
- if (!inode) {
- __d_instantiate(entry, NULL);
- return NULL;
- }
-
- hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
- /*
- * Don't need alias->d_lock here, because aliases with
- * d_parent == entry->d_parent are not subject to name or
- * parent changes, because the parent inode i_mutex is held.
- */
- if (alias->d_name.hash != hash)
- continue;
- if (alias->d_parent != entry->d_parent)
- continue;
- if (alias->d_name.len != len)
- continue;
- if (dentry_cmp(alias, name, len))
- continue;
- __dget(alias);
- return alias;
- }
-
- __d_instantiate(entry, inode);
- return NULL;
-}
-
-struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
-{
- struct dentry *result;
-
- BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
-
- if (inode)
- spin_lock(&inode->i_lock);
- result = __d_instantiate_unique(entry, inode);
- if (inode)
- spin_unlock(&inode->i_lock);
-
- if (!result) {
- security_d_instantiate(entry, inode);
- return NULL;
- }
-
- BUG_ON(!d_unhashed(result));
- iput(inode);
- return result;
-}
-
-EXPORT_SYMBOL(d_instantiate_unique);
-
-/**
* d_instantiate_no_diralias - instantiate a non-aliased dentry
* @entry: dentry to complete
* @inode: inode to attach to this dentry
@@ -2445,6 +2363,86 @@ void d_rehash(struct dentry * entry)
}
EXPORT_SYMBOL(d_rehash);
+
+/*
+ * Attach @inode (if any) to @dentry, call security_d_instantiate() and then
+ * hash the dentry with d_rehash().
+ *
+ * inode->i_lock held if inode is non-NULL; it is released here, right after
+ * __d_instantiate() and before the security hook runs.
+ */
+
+static inline void __d_add(struct dentry *dentry, struct inode *inode)
+{
+ if (inode) {
+ __d_instantiate(dentry, inode);
+ spin_unlock(&inode->i_lock);
+ }
+ security_d_instantiate(dentry, inode);
+ d_rehash(dentry);
+}
+
+/**
+ * d_add - add dentry to hash queues
+ * @entry: dentry to add
+ * @inode: The inode to attach to this dentry
+ *
+ * This adds the entry to the hash queues and initializes @inode.
+ * The entry was actually filled in earlier during d_alloc().
+ *
+ * When @inode is non-NULL its i_lock is taken here and dropped inside
+ * __d_add(), so the lock/unlock pair is split across the two functions.
+ */
+
+void d_add(struct dentry *entry, struct inode *inode)
+{
+ if (inode)
+ spin_lock(&inode->i_lock);
+ __d_add(entry, inode);
+}
+EXPORT_SYMBOL(d_add);
+
+/**
+ * d_exact_alias - find and hash an exact unhashed alias
+ * @entry: dentry to add
+ * @inode: The inode to go with this dentry
+ *
+ * If an unhashed dentry with the same name/parent and desired
+ * inode already exists, hash and return it. Otherwise, return
+ * NULL.
+ *
+ * Parent directory should be locked.
+ */
+struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
+{
+	struct dentry *cand, *found = NULL;
+	const char *want_name = entry->d_name.name;
+	unsigned int want_hash = entry->d_name.hash;
+	int want_len = entry->d_name.len;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(cand, &inode->i_dentry, d_u.d_alias) {
+		/*
+		 * cand->d_lock is not needed for these comparisons: aliases
+		 * sharing entry's parent cannot change name or parent while
+		 * the parent inode i_mutex is held.
+		 */
+		if (cand->d_name.hash != want_hash ||
+		    cand->d_parent != entry->d_parent ||
+		    cand->d_name.len != want_len ||
+		    dentry_cmp(cand, want_name, want_len))
+			continue;
+
+		/*
+		 * Only the first exact match is examined.  It is returned
+		 * (with a reference taken and rehashed) if it is currently
+		 * unhashed; otherwise the result stays NULL.
+		 */
+		spin_lock(&cand->d_lock);
+		if (d_unhashed(cand)) {
+			__dget_dlock(cand);
+			_d_rehash(cand);
+			found = cand;
+		}
+		spin_unlock(&cand->d_lock);
+		break;
+	}
+	spin_unlock(&inode->i_lock);
+	return found;
+}
+EXPORT_SYMBOL(d_exact_alias);
+
/**
* dentry_update_name_case - update case insensitive dentry with a new name
* @dentry: dentry to be updated
@@ -2461,7 +2459,7 @@ EXPORT_SYMBOL(d_rehash);
*/
void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
{
- BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
+ BUG_ON(!inode_is_locked(dentry->d_parent->d_inode));
BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
spin_lock(&dentry->d_lock);
@@ -2737,7 +2735,7 @@ static int __d_unalias(struct inode *inode,
if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
goto out_err;
m1 = &dentry->d_sb->s_vfs_rename_mutex;
- if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
+ if (!inode_trylock(alias->d_parent->d_inode))
goto out_err;
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
@@ -2781,10 +2779,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
BUG_ON(!d_unhashed(dentry));
- if (!inode) {
- __d_instantiate(dentry, NULL);
+ if (!inode)
goto out;
- }
+
spin_lock(&inode->i_lock);
if (S_ISDIR(inode->i_mode)) {
struct dentry *new = __d_find_any_alias(inode);
@@ -2818,12 +2815,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
return new;
}
}
- /* already taking inode->i_lock, so d_add() by hand */
- __d_instantiate(dentry, inode);
- spin_unlock(&inode->i_lock);
out:
- security_d_instantiate(dentry, inode);
- d_rehash(dentry);
+ __d_add(dentry, inode);
return NULL;
}
EXPORT_SYMBOL(d_splice_alias);
@@ -3303,18 +3296,18 @@ out:
* @new_dentry: new dentry
* @old_dentry: old dentry
*
- * Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
- * Returns 0 otherwise.
+ * Returns true if new_dentry is a subdirectory of the parent (at any depth).
+ * Returns false otherwise.
* Caller must ensure that "new_dentry" is pinned before calling is_subdir()
*/
-int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
- int result;
+ bool result;
unsigned seq;
if (new_dentry == old_dentry)
- return 1;
+ return true;
do {
/* for restarting inner loop in case of seq retry */
@@ -3325,9 +3318,9 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
*/
rcu_read_lock();
if (d_ancestor(old_dentry, new_dentry))
- result = 1;
+ result = true;
else
- result = 0;
+ result = false;
rcu_read_unlock();
} while (read_seqretry(&rename_lock, seq));
@@ -3415,7 +3408,7 @@ static void __init dcache_init(void)
* of the dcache.
*/
dentry_cache = KMEM_CACHE(dentry,
- SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+ SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
/* Hash may have been set up in dcache_init_early */
if (!hashdist)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 5d8f35f1382a..8580831ed237 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -265,20 +265,24 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
- if (IS_ERR(dentry))
- mutex_unlock(&d_inode(parent)->i_mutex);
+
+ if (IS_ERR(dentry)) {
+ inode_unlock(d_inode(parent));
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ }
+
return dentry;
}
static struct dentry *failed_creating(struct dentry *dentry)
{
- mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+ inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
return NULL;
@@ -286,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
static struct dentry *end_creating(struct dentry *dentry)
{
- mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+ inode_unlock(d_inode(dentry->d_parent));
return dentry;
}
@@ -453,7 +457,7 @@ struct dentry *debugfs_create_automount(const char *name,
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ make_empty_dir_inode(inode);
inode->i_flags |= S_AUTOMOUNT;
inode->i_private = data;
dentry->d_fsdata = (void *)f;
@@ -556,9 +560,9 @@ void debugfs_remove(struct dentry *dentry)
if (!parent || d_really_is_negative(parent))
return;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
ret = __debugfs_remove(dentry, parent);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
if (!ret)
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
@@ -590,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
parent = dentry;
down:
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
loop:
/*
* The parent->d_subdirs is protected by the d_lock. Outside that
@@ -605,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
/* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
spin_unlock(&parent->d_lock);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
parent = child;
goto down;
}
@@ -626,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry)
}
spin_unlock(&parent->d_lock);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
child = parent;
parent = parent->d_parent;
- mutex_lock(&d_inode(parent)->i_mutex);
+ inode_lock(d_inode(parent));
if (child != dentry)
/* go up */
@@ -637,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
if (!__debugfs_remove(child, parent))
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
- mutex_unlock(&d_inode(parent)->i_mutex);
+ inode_unlock(d_inode(parent));
}
EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c35ffdc12bba..655f21f99160 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb)
if (!uid_valid(root_uid) || !gid_valid(root_gid))
return -EINVAL;
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
/* If we have already created ptmx node, return */
if (fsi->ptmx_dentry) {
@@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb)
fsi->ptmx_dentry = dentry;
rc = 0;
out:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
return rc;
}
@@ -575,6 +575,26 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
mutex_unlock(&allocated_ptys_lock);
}
+/*
+ * pty code needs to hold extra references in case of last /dev/tty close
+ */
+
+void devpts_add_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ atomic_inc(&sb->s_active);
+ ihold(ptmx_inode);
+}
+
+void devpts_del_ref(struct inode *ptmx_inode)
+{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+
+ iput(ptmx_inode);
+ deactivate_super(sb);
+}
+
/**
* devpts_pty_new -- create a new inode in /dev/pts/
* @ptmx_inode: inode of the master
@@ -615,7 +635,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
sprintf(s, "%d", index);
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = d_alloc_name(root, s);
if (dentry) {
@@ -626,7 +646,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
inode = ERR_PTR(-ENOMEM);
}
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
return inode;
}
@@ -671,7 +691,7 @@ void devpts_pty_kill(struct inode *inode)
BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = d_find_alias(inode);
@@ -680,7 +700,7 @@ void devpts_pty_kill(struct inode *inode)
dput(dentry); /* d_alloc_name() in devpts_pty_new() */
dput(dentry); /* d_find_alias above */
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
}
static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3ae0e0427191..472037732daf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -109,6 +109,8 @@ struct dio_submit {
struct dio {
int flags; /* doesn't change */
int rw;
+ blk_qc_t bio_cookie;
+ struct block_device *bio_bdev;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
dio_iodone_t *end_io; /* IO completion function */
@@ -170,7 +172,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
*/
if (dio->page_errors == 0)
dio->page_errors = ret;
- page_cache_get(page);
+ get_page(page);
dio->pages[0] = page;
sdio->head = 0;
sdio->tail = 1;
@@ -251,8 +253,13 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
if (ret == 0)
ret = transferred;
- if (dio->end_io && dio->result)
- dio->end_io(dio->iocb, offset, transferred, dio->private);
+ if (dio->end_io) {
+ int err;
+
+ err = dio->end_io(dio->iocb, offset, ret, dio->private);
+ if (err)
+ ret = err;
+ }
if (!(dio->flags & DIO_SKIP_DIO_COUNT))
inode_dio_end(dio->inode);
@@ -361,7 +368,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
/*
* bio_alloc() is guaranteed to return a bio when called with
- * __GFP_WAIT and we request a valid number of vectors.
+ * __GFP_RECLAIM and we request a valid number of vectors.
*/
bio = bio_alloc(GFP_KERNEL, nr_vecs);
@@ -397,11 +404,14 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
if (dio->is_async && dio->rw == READ && dio->should_dirty)
bio_set_pages_dirty(bio);
- if (sdio->submit_io)
+ dio->bio_bdev = bio->bi_bdev;
+
+ if (sdio->submit_io) {
sdio->submit_io(dio->rw, bio, dio->inode,
sdio->logical_offset_in_bio);
- else
- submit_bio(dio->rw, bio);
+ dio->bio_cookie = BLK_QC_T_NONE;
+ } else
+ dio->bio_cookie = submit_bio(dio->rw, bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -414,7 +424,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
while (sdio->head < sdio->tail)
- page_cache_release(dio->pages[sdio->head++]);
+ put_page(dio->pages[sdio->head++]);
}
/*
@@ -440,7 +450,9 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- io_schedule();
+ if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
+ !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -466,8 +478,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
dio->io_error = -EIO;
if (dio->is_async && dio->rw == READ && dio->should_dirty) {
- bio_check_pages_dirty(bio); /* transfers ownership */
err = bio->bi_error;
+ bio_check_pages_dirty(bio); /* transfers ownership */
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
@@ -475,7 +487,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
if (dio->rw == READ && !PageCompound(page) &&
dio->should_dirty)
set_page_dirty_lock(page);
- page_cache_release(page);
+ put_page(page);
}
err = bio->bi_error;
bio_put(bio);
@@ -684,7 +696,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio)
*/
if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
sdio->pages_in_io--;
- page_cache_get(sdio->cur_page);
+ get_page(sdio->cur_page);
sdio->final_block_in_bio = sdio->cur_page_block +
(sdio->cur_page_len >> sdio->blkbits);
ret = 0;
@@ -798,13 +810,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
*/
if (sdio->cur_page) {
ret = dio_send_cur_page(dio, sdio, map_bh);
- page_cache_release(sdio->cur_page);
+ put_page(sdio->cur_page);
sdio->cur_page = NULL;
if (ret)
return ret;
}
- page_cache_get(page); /* It is in dio */
+ get_page(page); /* It is in dio */
sdio->cur_page = page;
sdio->cur_page_offset = offset;
sdio->cur_page_len = len;
@@ -818,7 +830,7 @@ out:
if (sdio->boundary) {
ret = dio_send_cur_page(dio, sdio, map_bh);
dio_bio_submit(dio, sdio);
- page_cache_release(sdio->cur_page);
+ put_page(sdio->cur_page);
sdio->cur_page = NULL;
}
return ret;
@@ -935,7 +947,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
ret = get_more_blocks(dio, sdio, map_bh);
if (ret) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
if (!buffer_mapped(map_bh))
@@ -976,7 +988,7 @@ do_holes:
/* AKPM: eargh, -ENOTBLK is a hack */
if (dio->rw & WRITE) {
- page_cache_release(page);
+ put_page(page);
return -ENOTBLK;
}
@@ -989,7 +1001,7 @@ do_holes:
if (sdio->block_in_file >=
i_size_aligned >> blkbits) {
/* We hit eof */
- page_cache_release(page);
+ put_page(page);
goto out;
}
zero_user(page, from, 1 << blkbits);
@@ -1029,7 +1041,7 @@ do_holes:
sdio->next_block_for_io,
map_bh);
if (ret) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
sdio->next_block_for_io += this_chunk_blocks;
@@ -1045,7 +1057,7 @@ next_block:
}
/* Drop the ref which was taken in get_user_pages() */
- page_cache_release(page);
+ put_page(page);
}
out:
return ret;
@@ -1151,18 +1163,28 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
iocb->ki_filp->f_mapping;
/* will be released by direct_io_worker */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = filemap_write_and_wait_range(mapping, offset,
end - 1);
if (retval) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
kmem_cache_free(dio_cache, dio);
goto out;
}
}
}
+ /* Once we sampled i_size check for reads beyond EOF */
+ dio->i_size = i_size_read(inode);
+ if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
+ if (dio->flags & DIO_LOCKING)
+ inode_unlock(inode);
+ kmem_cache_free(dio_cache, dio);
+ retval = 0;
+ goto out;
+ }
+
/*
* For file extending writes updating i_size before data writeouts
* complete can expose uninitialized blocks in dumb filesystems.
@@ -1216,7 +1238,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
sdio.next_block_for_io = -1;
dio->iocb = iocb;
- dio->i_size = i_size_read(inode);
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
@@ -1260,7 +1281,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
if (retval == 0)
retval = ret2;
- page_cache_release(sdio.cur_page);
+ put_page(sdio.cur_page);
sdio.cur_page = NULL;
}
if (sdio.bio)
@@ -1280,7 +1301,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* of protecting us from looking up uninitialized blocks.
*/
if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
- mutex_unlock(&dio->inode->i_mutex);
+ inode_unlock(dio->inode);
/*
* The only time we want to leave bios in flight is when a successful
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index d521bddf876d..1669f6291c95 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -61,35 +61,8 @@ static struct config_item *make_node(struct config_group *, const char *);
static void drop_node(struct config_group *, struct config_item *);
static void release_node(struct config_item *);
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_cluster(struct config_item *i,
- struct configfs_attribute *a,
- const char *buf, size_t len);
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len);
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
- char *buf);
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len);
-
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
- size_t len);
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf);
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
- size_t len);
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
- size_t len);
+static struct configfs_attribute *comm_attrs[];
+static struct configfs_attribute *node_attrs[];
struct dlm_cluster {
struct config_group group;
@@ -108,6 +81,12 @@ struct dlm_cluster {
char cl_cluster_name[DLM_LOCKSPACE_LEN];
};
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
+{
+ return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+ NULL;
+}
+
enum {
CLUSTER_ATTR_TCP_PORT = 0,
CLUSTER_ATTR_BUFFER_SIZE,
@@ -124,33 +103,24 @@ enum {
CLUSTER_ATTR_CLUSTER_NAME,
};
-struct cluster_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_cluster *, char *);
- ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
-};
-
-static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
+static ssize_t cluster_cluster_name_show(struct config_item *item, char *buf)
{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
return sprintf(buf, "%s\n", cl->cl_cluster_name);
}
-static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
+static ssize_t cluster_cluster_name_store(struct config_item *item,
const char *buf, size_t len)
{
+ struct dlm_cluster *cl = config_item_to_cluster(item);
+
strlcpy(dlm_config.ci_cluster_name, buf,
sizeof(dlm_config.ci_cluster_name));
strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name));
return len;
}
-static struct cluster_attribute cluster_attr_cluster_name = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "cluster_name",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = cluster_cluster_name_read,
- .store = cluster_cluster_name_write,
-};
+CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
int *info_field, int check_zero,
@@ -175,17 +145,19 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
}
#define CLUSTER_ATTR(name, check_zero) \
-static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
+static ssize_t cluster_##name##_store(struct config_item *item, \
+ const char *buf, size_t len) \
{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
check_zero, buf, len); \
} \
-static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \
+static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
+ struct dlm_cluster *cl = config_item_to_cluster(item); \
return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
} \
-static struct cluster_attribute cluster_attr_##name = \
-__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
+CONFIGFS_ATTR(cluster_, name);
CLUSTER_ATTR(tcp_port, 1);
CLUSTER_ATTR(buffer_size, 1);
@@ -201,19 +173,19 @@ CLUSTER_ATTR(new_rsb_count, 0);
CLUSTER_ATTR(recover_callbacks, 0);
static struct configfs_attribute *cluster_attrs[] = {
- [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
- [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
- [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
- [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
- [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
- [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
- [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
- [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
- [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
- [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
- [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
- [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
- [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
+ [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
+ [CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size,
+ [CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size,
+ [CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer,
+ [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs,
+ [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs,
+ [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug,
+ [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
+ [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
+ [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
+ [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
+ [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
+ [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
NULL,
};
@@ -224,83 +196,11 @@ enum {
COMM_ATTR_ADDR_LIST,
};
-struct comm_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_comm *, char *);
- ssize_t (*store)(struct dlm_comm *, const char *, size_t);
-};
-
-static struct comm_attribute comm_attr_nodeid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "nodeid",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = comm_nodeid_read,
- .store = comm_nodeid_write,
-};
-
-static struct comm_attribute comm_attr_local = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "local",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = comm_local_read,
- .store = comm_local_write,
-};
-
-static struct comm_attribute comm_attr_addr = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "addr",
- .ca_mode = S_IWUSR },
- .store = comm_addr_write,
-};
-
-static struct comm_attribute comm_attr_addr_list = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "addr_list",
- .ca_mode = S_IRUGO },
- .show = comm_addr_list_read,
-};
-
-static struct configfs_attribute *comm_attrs[] = {
- [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
- [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
- [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
- [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list.attr,
- NULL,
-};
-
enum {
NODE_ATTR_NODEID = 0,
NODE_ATTR_WEIGHT,
};
-struct node_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct dlm_node *, char *);
- ssize_t (*store)(struct dlm_node *, const char *, size_t);
-};
-
-static struct node_attribute node_attr_nodeid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "nodeid",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = node_nodeid_read,
- .store = node_nodeid_write,
-};
-
-static struct node_attribute node_attr_weight = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "weight",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = node_weight_read,
- .store = node_weight_write,
-};
-
-static struct configfs_attribute *node_attrs[] = {
- [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
- [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
- NULL,
-};
-
struct dlm_clusters {
struct configfs_subsystem subsys;
};
@@ -349,8 +249,6 @@ static struct configfs_group_operations clusters_ops = {
static struct configfs_item_operations cluster_ops = {
.release = release_cluster,
- .show_attribute = show_cluster,
- .store_attribute = store_cluster,
};
static struct configfs_group_operations spaces_ops = {
@@ -369,8 +267,6 @@ static struct configfs_group_operations comms_ops = {
static struct configfs_item_operations comm_ops = {
.release = release_comm,
- .show_attribute = show_comm,
- .store_attribute = store_comm,
};
static struct configfs_group_operations nodes_ops = {
@@ -380,8 +276,6 @@ static struct configfs_group_operations nodes_ops = {
static struct configfs_item_operations node_ops = {
.release = release_node,
- .show_attribute = show_node,
- .store_attribute = store_node,
};
static struct config_item_type clusters_type = {
@@ -427,12 +321,6 @@ static struct config_item_type node_type = {
.ct_owner = THIS_MODULE,
};
-static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
-{
- return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
- NULL;
-}
-
static struct dlm_space *config_item_to_space(struct config_item *i)
{
return i ? container_of(to_config_group(i), struct dlm_space, group) :
@@ -455,24 +343,20 @@ static struct config_group *make_cluster(struct config_group *g,
struct dlm_cluster *cl = NULL;
struct dlm_spaces *sps = NULL;
struct dlm_comms *cms = NULL;
- void *gps = NULL;
cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
- gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
- if (!cl || !gps || !sps || !cms)
+ if (!cl || !sps || !cms)
goto fail;
config_group_init_type_name(&cl->group, name, &cluster_type);
config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
- cl->group.default_groups = gps;
- cl->group.default_groups[0] = &sps->ss_group;
- cl->group.default_groups[1] = &cms->cs_group;
- cl->group.default_groups[2] = NULL;
+ configfs_add_default_group(&sps->ss_group, &cl->group);
+ configfs_add_default_group(&cms->cs_group, &cl->group);
cl->cl_tcp_port = dlm_config.ci_tcp_port;
cl->cl_buffer_size = dlm_config.ci_buffer_size;
@@ -495,7 +379,6 @@ static struct config_group *make_cluster(struct config_group *g,
fail:
kfree(cl);
- kfree(gps);
kfree(sps);
kfree(cms);
return ERR_PTR(-ENOMEM);
@@ -504,14 +387,8 @@ static struct config_group *make_cluster(struct config_group *g,
static void drop_cluster(struct config_group *g, struct config_item *i)
{
struct dlm_cluster *cl = config_item_to_cluster(i);
- struct config_item *tmp;
- int j;
- for (j = 0; cl->group.default_groups[j]; j++) {
- tmp = &cl->group.default_groups[j]->cg_item;
- cl->group.default_groups[j] = NULL;
- config_item_put(tmp);
- }
+ configfs_remove_default_groups(&cl->group);
space_list = NULL;
comm_list = NULL;
@@ -522,7 +399,6 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
static void release_cluster(struct config_item *i)
{
struct dlm_cluster *cl = config_item_to_cluster(i);
- kfree(cl->group.default_groups);
kfree(cl);
}
@@ -530,21 +406,17 @@ static struct config_group *make_space(struct config_group *g, const char *name)
{
struct dlm_space *sp = NULL;
struct dlm_nodes *nds = NULL;
- void *gps = NULL;
sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
- gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
- if (!sp || !gps || !nds)
+ if (!sp || !nds)
goto fail;
config_group_init_type_name(&sp->group, name, &space_type);
- config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
- sp->group.default_groups = gps;
- sp->group.default_groups[0] = &nds->ns_group;
- sp->group.default_groups[1] = NULL;
+ config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+ configfs_add_default_group(&nds->ns_group, &sp->group);
INIT_LIST_HEAD(&sp->members);
mutex_init(&sp->members_lock);
@@ -553,7 +425,6 @@ static struct config_group *make_space(struct config_group *g, const char *name)
fail:
kfree(sp);
- kfree(gps);
kfree(nds);
return ERR_PTR(-ENOMEM);
}
@@ -561,24 +432,16 @@ static struct config_group *make_space(struct config_group *g, const char *name)
static void drop_space(struct config_group *g, struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
- struct config_item *tmp;
- int j;
/* assert list_empty(&sp->members) */
- for (j = 0; sp->group.default_groups[j]; j++) {
- tmp = &sp->group.default_groups[j]->cg_item;
- sp->group.default_groups[j] = NULL;
- config_item_put(tmp);
- }
-
+ configfs_remove_default_groups(&sp->group);
config_item_put(i);
}
static void release_space(struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
- kfree(sp->group.default_groups);
kfree(sp);
}
@@ -687,66 +550,30 @@ void dlm_config_exit(void)
* Functions for user space to read/write attributes
*/
-static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
- char *buf)
+static ssize_t comm_nodeid_show(struct config_item *item, char *buf)
{
- struct dlm_cluster *cl = config_item_to_cluster(i);
- struct cluster_attribute *cla =
- container_of(a, struct cluster_attribute, attr);
- return cla->show ? cla->show(cl, buf) : 0;
+ return sprintf(buf, "%d\n", config_item_to_comm(item)->nodeid);
}
-static ssize_t store_cluster(struct config_item *i,
- struct configfs_attribute *a,
- const char *buf, size_t len)
-{
- struct dlm_cluster *cl = config_item_to_cluster(i);
- struct cluster_attribute *cla =
- container_of(a, struct cluster_attribute, attr);
- return cla->store ? cla->store(cl, buf, len) : -EINVAL;
-}
-
-static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_comm *cm = config_item_to_comm(i);
- struct comm_attribute *cma =
- container_of(a, struct comm_attribute, attr);
- return cma->show ? cma->show(cm, buf) : 0;
-}
-
-static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len)
-{
- struct dlm_comm *cm = config_item_to_comm(i);
- struct comm_attribute *cma =
- container_of(a, struct comm_attribute, attr);
- return cma->store ? cma->store(cm, buf, len) : -EINVAL;
-}
-
-static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
-{
- return sprintf(buf, "%d\n", cm->nodeid);
-}
-
-static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
- int rc = kstrtoint(buf, 0, &cm->nodeid);
+ int rc = kstrtoint(buf, 0, &config_item_to_comm(item)->nodeid);
if (rc)
return rc;
return len;
}
-static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_local_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", cm->local);
+ return sprintf(buf, "%d\n", config_item_to_comm(item)->local);
}
-static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+static ssize_t comm_local_store(struct config_item *item, const char *buf,
size_t len)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
int rc = kstrtoint(buf, 0, &cm->local);
if (rc)
@@ -756,8 +583,10 @@ static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
return len;
}
-static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_store(struct config_item *item, const char *buf,
+ size_t len)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
struct sockaddr_storage *addr;
int rv;
@@ -783,8 +612,9 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
return len;
}
-static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
+static ssize_t comm_addr_list_show(struct config_item *item, char *buf)
{
+ struct dlm_comm *cm = config_item_to_comm(item);
ssize_t s;
ssize_t allowance;
int i;
@@ -827,32 +657,28 @@ static ssize_t comm_addr_list_read(struct dlm_comm *cm, char *buf)
return 4096 - allowance;
}
-static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
- char *buf)
-{
- struct dlm_node *nd = config_item_to_node(i);
- struct node_attribute *nda =
- container_of(a, struct node_attribute, attr);
- return nda->show ? nda->show(nd, buf) : 0;
-}
+CONFIGFS_ATTR(comm_, nodeid);
+CONFIGFS_ATTR(comm_, local);
+CONFIGFS_ATTR_WO(comm_, addr);
+CONFIGFS_ATTR_RO(comm_, addr_list);
-static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
- const char *buf, size_t len)
-{
- struct dlm_node *nd = config_item_to_node(i);
- struct node_attribute *nda =
- container_of(a, struct node_attribute, attr);
- return nda->store ? nda->store(nd, buf, len) : -EINVAL;
-}
+static struct configfs_attribute *comm_attrs[] = {
+ [COMM_ATTR_NODEID] = &comm_attr_nodeid,
+ [COMM_ATTR_LOCAL] = &comm_attr_local,
+ [COMM_ATTR_ADDR] = &comm_attr_addr,
+ [COMM_ATTR_ADDR_LIST] = &comm_attr_addr_list,
+ NULL,
+};
-static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
+static ssize_t node_nodeid_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", nd->nodeid);
+ return sprintf(buf, "%d\n", config_item_to_node(item)->nodeid);
}
-static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_nodeid_store(struct config_item *item, const char *buf,
size_t len)
{
+ struct dlm_node *nd = config_item_to_node(item);
uint32_t seq = 0;
int rc = kstrtoint(buf, 0, &nd->nodeid);
@@ -863,21 +689,30 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
return len;
}
-static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
+static ssize_t node_weight_show(struct config_item *item, char *buf)
{
- return sprintf(buf, "%d\n", nd->weight);
+ return sprintf(buf, "%d\n", config_item_to_node(item)->weight);
}
-static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+static ssize_t node_weight_store(struct config_item *item, const char *buf,
size_t len)
{
- int rc = kstrtoint(buf, 0, &nd->weight);
+ int rc = kstrtoint(buf, 0, &config_item_to_node(item)->weight);
if (rc)
return rc;
return len;
}
+CONFIGFS_ATTR(node_, nodeid);
+CONFIGFS_ATTR(node_, weight);
+
+static struct configfs_attribute *node_attrs[] = {
+ [NODE_ATTR_NODEID] = &node_attr_nodeid,
+ [NODE_ATTR_WEIGHT] = &node_attr_weight,
+ NULL,
+};
+
/*
* Functions for the dlm to get the info that's been configured
*/
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 87e9d796cf7d..1ab012a27d9f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -124,7 +124,10 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
- void (*orig_error_report)(struct sock *sk);
+ void (*orig_error_report)(struct sock *);
+ void (*orig_data_ready)(struct sock *);
+ void (*orig_state_change)(struct sock *);
+ void (*orig_write_space)(struct sock *);
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -421,7 +424,7 @@ static void lowcomms_write_space(struct sock *sk)
if (test_and_clear_bit(CF_APP_LIMITED, &con->flags)) {
con->sock->sk->sk_write_pending--;
- clear_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags);
+ clear_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags);
}
if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
@@ -467,16 +470,24 @@ int dlm_lowcomms_connect_node(int nodeid)
static void lowcomms_error_report(struct sock *sk)
{
- struct connection *con = sock2con(sk);
+ struct connection *con;
struct sockaddr_storage saddr;
+ int buflen;
+ void (*orig_report)(struct sock *) = NULL;
- if (nodeid_to_addr(con->nodeid, &saddr, NULL, false)) {
+ read_lock_bh(&sk->sk_callback_lock);
+ con = sock2con(sk);
+ if (con == NULL)
+ goto out;
+
+ orig_report = con->orig_error_report;
+ if (con->sock == NULL ||
+ kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) {
printk_ratelimited(KERN_ERR "dlm: node %d: socket error "
"sending to node %d, port %d, "
"sk_err=%d/%d\n", dlm_our_nodeid(),
con->nodeid, dlm_config.ci_tcp_port,
sk->sk_err, sk->sk_err_soft);
- return;
} else if (saddr.ss_family == AF_INET) {
struct sockaddr_in *sin4 = (struct sockaddr_in *)&saddr;
@@ -499,22 +510,54 @@ static void lowcomms_error_report(struct sock *sk)
dlm_config.ci_tcp_port, sk->sk_err,
sk->sk_err_soft);
}
- con->orig_error_report(sk);
+out:
+ read_unlock_bh(&sk->sk_callback_lock);
+ if (orig_report)
+ orig_report(sk);
+}
+
+/*
+ * Record the socket's current callbacks in @con so that
+ * restore_callbacks() can reinstall them when the connection is closed.
+ *
+ * Note: sk_callback_lock must be locked before calling this function.
+ * add_sock() takes it with write_lock_bh(), so we run in atomic context
+ * here and must not sleep.  That rules out lock_sock(); the callback
+ * lock is what serialises access to these pointers anyway.
+ */
+static void save_callbacks(struct connection *con, struct sock *sk)
+{
+	con->orig_data_ready = sk->sk_data_ready;
+	con->orig_state_change = sk->sk_state_change;
+	con->orig_write_space = sk->sk_write_space;
+	con->orig_error_report = sk->sk_error_report;
+}
+
+/*
+ * Detach @con from @sk and put back the callbacks that save_callbacks()
+ * recorded.
+ *
+ * Everything is done under sk_callback_lock with bottom halves disabled,
+ * so nothing in here may sleep; lock_sock() must therefore not be taken
+ * inside the locked region.  Clearing sk_user_data under the same lock
+ * lets lowcomms_error_report() notice the missing connection and skip
+ * its reporting.
+ */
+static void restore_callbacks(struct connection *con, struct sock *sk)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_user_data = NULL;
+	sk->sk_data_ready = con->orig_data_ready;
+	sk->sk_state_change = con->orig_state_change;
+	sk->sk_write_space = con->orig_write_space;
+	sk->sk_error_report = con->orig_error_report;
+	write_unlock_bh(&sk->sk_callback_lock);
 }
/* Make a socket active */
static void add_sock(struct socket *sock, struct connection *con)
{
+ struct sock *sk = sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
con->sock = sock;
+ sk->sk_user_data = con;
+ if (!test_bit(CF_IS_OTHERCON, &con->flags))
+ save_callbacks(con, sk);
/* Install a data_ready callback */
- con->sock->sk->sk_data_ready = lowcomms_data_ready;
- con->sock->sk->sk_write_space = lowcomms_write_space;
- con->sock->sk->sk_state_change = lowcomms_state_change;
- con->sock->sk->sk_user_data = con;
- con->sock->sk->sk_allocation = GFP_NOFS;
- con->orig_error_report = con->sock->sk->sk_error_report;
- con->sock->sk->sk_error_report = lowcomms_error_report;
+ sk->sk_data_ready = lowcomms_data_ready;
+ sk->sk_write_space = lowcomms_write_space;
+ sk->sk_state_change = lowcomms_state_change;
+ sk->sk_allocation = GFP_NOFS;
+ sk->sk_error_report = lowcomms_error_report;
+ write_unlock_bh(&sk->sk_callback_lock);
}
/* Add the port number to an IPv6 or 4 sockaddr and return the address
@@ -549,6 +592,8 @@ static void close_connection(struct connection *con, bool and_other,
mutex_lock(&con->sock_mutex);
if (con->sock) {
+ if (!test_bit(CF_IS_OTHERCON, &con->flags))
+ restore_callbacks(con, con->sock->sk);
sock_release(con->sock);
con->sock = NULL;
}
@@ -595,7 +640,7 @@ static int receive_from_sock(struct connection *con)
con->rx_page = alloc_page(GFP_ATOMIC);
if (con->rx_page == NULL)
goto out_resched;
- cbuf_init(&con->cb, PAGE_CACHE_SIZE);
+ cbuf_init(&con->cb, PAGE_SIZE);
}
/*
@@ -612,7 +657,7 @@ static int receive_from_sock(struct connection *con)
* buffer and the start of the currently used section (cb.base)
*/
if (cbuf_data(&con->cb) >= con->cb.base) {
- iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
+ iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb);
iov[1].iov_len = con->cb.base;
iov[1].iov_base = page_address(con->rx_page);
nvec = 2;
@@ -630,7 +675,7 @@ static int receive_from_sock(struct connection *con)
ret = dlm_process_incoming_buffer(con->nodeid,
page_address(con->rx_page),
con->cb.base, con->cb.len,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (ret == -EBADMSG) {
log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d",
page_address(con->rx_page), con->cb.base,
@@ -1190,6 +1235,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con,
if (result < 0) {
log_print("Failed to set SO_REUSEADDR on socket: %d", result);
}
+ sock->sk->sk_user_data = con;
+
con->rx_action = tcp_accept_from_sock;
con->connect_action = tcp_connect_to_sock;
@@ -1271,6 +1318,7 @@ static int sctp_listen_for_all(void)
if (result < 0)
log_print("Could not set SCTP NODELAY error %d\n", result);
+ write_lock_bh(&sock->sk->sk_callback_lock);
/* Init con struct */
sock->sk->sk_user_data = con;
con->sock = sock;
@@ -1278,6 +1326,8 @@ static int sctp_listen_for_all(void)
con->rx_action = sctp_accept_from_sock;
con->connect_action = sctp_connect_to_sock;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+
/* Bind to all addresses. */
if (sctp_bind_addrs(con, dlm_config.ci_tcp_port))
goto create_delsock;
@@ -1366,7 +1416,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
spin_lock(&con->writequeue_lock);
e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
if ((&e->list == &con->writequeue) ||
- (PAGE_CACHE_SIZE - e->end < len)) {
+ (PAGE_SIZE - e->end < len)) {
e = NULL;
} else {
offset = e->end;
@@ -1448,7 +1498,7 @@ static void send_to_sock(struct connection *con)
msg_flags);
if (ret == -EAGAIN || ret == 0) {
if (ret == -EAGAIN &&
- test_bit(SOCK_ASYNC_NOSPACE, &con->sock->flags) &&
+ test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) &&
!test_and_set_bit(CF_APP_LIMITED, &con->flags)) {
/* Notify TCP that we're limited by the
* application window size.
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 173b3873a4f4..58c2f4a21b7f 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -515,14 +515,9 @@ static ssize_t device_write(struct file *file, const char __user *buf,
if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
return -EINVAL;
- kbuf = kzalloc(count + 1, GFP_NOFS);
- if (!kbuf)
- return -ENOMEM;
-
- if (copy_from_user(kbuf, buf, count)) {
- error = -EFAULT;
- goto out_free;
- }
+ kbuf = memdup_user_nul(buf, count);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
if (check_version(kbuf)) {
error = -EBADE;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 80d6901493cf..d09cb4cdd09f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -23,6 +23,8 @@
* 02111-1307, USA.
*/
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
@@ -30,7 +32,6 @@
#include <linux/compiler.h>
#include <linux/key.h>
#include <linux/namei.h>
-#include <linux/crypto.h>
#include <linux/file.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
@@ -74,6 +75,19 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size)
}
}
+/*
+ * One-shot digest helper: hash @len bytes starting at @src with @tfm and
+ * store the result in @dst.  Uses an on-stack shash descriptor, which is
+ * zeroed again before returning.  Returns whatever crypto_shash_digest()
+ * returned.
+ */
+static int ecryptfs_hash_digest(struct crypto_shash *tfm,
+				char *src, int len, char *dst)
+{
+	int rc;
+	SHASH_DESC_ON_STACK(desc, tfm);
+
+	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	desc->tfm = tfm;
+
+	rc = crypto_shash_digest(desc, src, len, dst);
+
+	/* Wipe the descriptor before its stack storage goes out of scope. */
+	shash_desc_zero(desc);
+
+	return rc;
+}
+
/**
* ecryptfs_calculate_md5 - calculates the md5 of @src
* @dst: Pointer to 16 bytes of allocated memory
@@ -88,45 +102,26 @@ static int ecryptfs_calculate_md5(char *dst,
struct ecryptfs_crypt_stat *crypt_stat,
char *src, int len)
{
- struct scatterlist sg;
- struct hash_desc desc = {
- .tfm = crypt_stat->hash_tfm,
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
+ struct crypto_shash *tfm;
int rc = 0;
mutex_lock(&crypt_stat->cs_hash_tfm_mutex);
- sg_init_one(&sg, (u8 *)src, len);
- if (!desc.tfm) {
- desc.tfm = crypto_alloc_hash(ECRYPTFS_DEFAULT_HASH, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm)) {
- rc = PTR_ERR(desc.tfm);
+ tfm = crypt_stat->hash_tfm;
+ if (!tfm) {
+ tfm = crypto_alloc_shash(ECRYPTFS_DEFAULT_HASH, 0, 0);
+ if (IS_ERR(tfm)) {
+ rc = PTR_ERR(tfm);
ecryptfs_printk(KERN_ERR, "Error attempting to "
"allocate crypto context; rc = [%d]\n",
rc);
goto out;
}
- crypt_stat->hash_tfm = desc.tfm;
- }
- rc = crypto_hash_init(&desc);
- if (rc) {
- printk(KERN_ERR
- "%s: Error initializing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out;
+ crypt_stat->hash_tfm = tfm;
}
- rc = crypto_hash_update(&desc, &sg, len);
+ rc = ecryptfs_hash_digest(tfm, src, len, dst);
if (rc) {
printk(KERN_ERR
- "%s: Error updating crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out;
- }
- rc = crypto_hash_final(&desc, dst);
- if (rc) {
- printk(KERN_ERR
- "%s: Error finalizing crypto hash; rc = [%d]\n",
+ "%s: Error computing crypto hash; rc = [%d]\n",
__func__, rc);
goto out;
}
@@ -234,10 +229,8 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
{
struct ecryptfs_key_sig *key_sig, *key_sig_tmp;
- if (crypt_stat->tfm)
- crypto_free_ablkcipher(crypt_stat->tfm);
- if (crypt_stat->hash_tfm)
- crypto_free_hash(crypt_stat->hash_tfm);
+ crypto_free_skcipher(crypt_stat->tfm);
+ crypto_free_shash(crypt_stat->hash_tfm);
list_for_each_entry_safe(key_sig, key_sig_tmp,
&crypt_stat->keysig_list, crypt_stat_list) {
list_del(&key_sig->crypt_stat_list);
@@ -293,7 +286,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
pg = virt_to_page(addr);
offset = offset_in_page(addr);
sg_set_page(&sg[i], pg, 0, offset);
- remainder_of_page = PAGE_CACHE_SIZE - offset;
+ remainder_of_page = PAGE_SIZE - offset;
if (size >= remainder_of_page) {
sg[i].length = remainder_of_page;
addr += remainder_of_page;
@@ -342,7 +335,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
struct scatterlist *src_sg, int size,
unsigned char *iv, int op)
{
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
struct extent_crypt_result ecr;
int rc = 0;
@@ -358,20 +351,20 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
init_completion(&ecr.completion);
mutex_lock(&crypt_stat->cs_tfm_mutex);
- req = ablkcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
+ req = skcipher_request_alloc(crypt_stat->tfm, GFP_NOFS);
if (!req) {
mutex_unlock(&crypt_stat->cs_tfm_mutex);
rc = -ENOMEM;
goto out;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
extent_crypt_complete, &ecr);
/* Consider doing this once, when the file is opened */
if (!(crypt_stat->flags & ECRYPTFS_KEY_SET)) {
- rc = crypto_ablkcipher_setkey(crypt_stat->tfm, crypt_stat->key,
- crypt_stat->key_size);
+ rc = crypto_skcipher_setkey(crypt_stat->tfm, crypt_stat->key,
+ crypt_stat->key_size);
if (rc) {
ecryptfs_printk(KERN_ERR,
"Error setting key; rc = [%d]\n",
@@ -383,9 +376,9 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
crypt_stat->flags |= ECRYPTFS_KEY_SET;
}
mutex_unlock(&crypt_stat->cs_tfm_mutex);
- ablkcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
- rc = op == ENCRYPT ? crypto_ablkcipher_encrypt(req) :
- crypto_ablkcipher_decrypt(req);
+ skcipher_request_set_crypt(req, src_sg, dst_sg, size, iv);
+ rc = op == ENCRYPT ? crypto_skcipher_encrypt(req) :
+ crypto_skcipher_decrypt(req);
if (rc == -EINPROGRESS || rc == -EBUSY) {
struct extent_crypt_result *ecr = req->base.data;
@@ -394,7 +387,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
reinit_completion(&ecr->completion);
}
out:
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
return rc;
}
@@ -407,7 +400,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
struct page *page)
{
return ecryptfs_lower_header_size(crypt_stat) +
- ((loff_t)page->index << PAGE_CACHE_SHIFT);
+ ((loff_t)page->index << PAGE_SHIFT);
}
/**
@@ -435,7 +428,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
size_t extent_size = crypt_stat->extent_size;
int rc;
- extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
+ extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size));
rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
(extent_base + extent_offset));
if (rc) {
@@ -505,7 +498,7 @@ int ecryptfs_encrypt_page(struct page *page)
}
for (extent_offset = 0;
- extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+ extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
rc = crypt_extent(crypt_stat, enc_extent_page, page,
extent_offset, ENCRYPT);
@@ -519,7 +512,7 @@ int ecryptfs_encrypt_page(struct page *page)
lower_offset = lower_offset_for_page(crypt_stat, page);
enc_extent_virt = kmap(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
kunmap(enc_extent_page);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
@@ -567,7 +560,7 @@ int ecryptfs_decrypt_page(struct page *page)
lower_offset = lower_offset_for_page(crypt_stat, page);
page_virt = kmap(page);
- rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
+ rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
ecryptfs_inode);
kunmap(page);
if (rc < 0) {
@@ -578,7 +571,7 @@ int ecryptfs_decrypt_page(struct page *page)
}
for (extent_offset = 0;
- extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+ extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
rc = crypt_extent(crypt_stat, page, page,
extent_offset, DECRYPT);
@@ -622,7 +615,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
crypt_stat->cipher, "cbc");
if (rc)
goto out_unlock;
- crypt_stat->tfm = crypto_alloc_ablkcipher(full_alg_name, 0, 0);
+ crypt_stat->tfm = crypto_alloc_skcipher(full_alg_name, 0, 0);
if (IS_ERR(crypt_stat->tfm)) {
rc = PTR_ERR(crypt_stat->tfm);
crypt_stat->tfm = NULL;
@@ -631,7 +624,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
full_alg_name);
goto out_free;
}
- crypto_ablkcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ crypto_skcipher_set_flags(crypt_stat->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
rc = 0;
out_free:
kfree(full_alg_name);
@@ -666,11 +659,11 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
else {
- if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
+ if (PAGE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
crypt_stat->metadata_size =
ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
else
- crypt_stat->metadata_size = PAGE_CACHE_SIZE;
+ crypt_stat->metadata_size = PAGE_SIZE;
}
}
@@ -1449,7 +1442,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ECRYPTFS_VALIDATE_HEADER_SIZE);
if (rc) {
/* metadata is not in the file header, so try xattrs */
- memset(page_virt, 0, PAGE_CACHE_SIZE);
+ memset(page_virt, 0, PAGE_SIZE);
rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
@@ -1482,7 +1475,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
}
out:
if (page_virt) {
- memset(page_virt, 0, PAGE_CACHE_SIZE);
+ memset(page_virt, 0, PAGE_SIZE);
kmem_cache_free(ecryptfs_header_cache, page_virt);
}
return rc;
@@ -1499,16 +1492,14 @@ out:
*/
static int
ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
- struct ecryptfs_crypt_stat *crypt_stat,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
int rc = 0;
filename->encrypted_filename = NULL;
filename->encrypted_filename_size = 0;
- if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
- || (mount_crypt_stat && (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+ if (mount_crypt_stat && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
size_t packet_size;
size_t remaining_bytes;
@@ -1591,7 +1582,7 @@ out:
* event, regardless of whether this function succeeds or fails.
*/
static int
-ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
+ecryptfs_process_key_cipher(struct crypto_skcipher **key_tfm,
char *cipher_name, size_t *key_size)
{
char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
@@ -1609,21 +1600,18 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
"ecb");
if (rc)
goto out;
- *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
+ *key_tfm = crypto_alloc_skcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(*key_tfm)) {
rc = PTR_ERR(*key_tfm);
printk(KERN_ERR "Unable to allocate crypto cipher with name "
"[%s]; rc = [%d]\n", full_alg_name, rc);
goto out;
}
- crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
- if (*key_size == 0) {
- struct blkcipher_alg *alg = crypto_blkcipher_alg(*key_tfm);
-
- *key_size = alg->max_keysize;
- }
+ crypto_skcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ if (*key_size == 0)
+ *key_size = crypto_skcipher_default_keysize(*key_tfm);
get_random_bytes(dummy_key, *key_size);
- rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
+ rc = crypto_skcipher_setkey(*key_tfm, dummy_key, *key_size);
if (rc) {
printk(KERN_ERR "Error attempting to set key of size [%zd] for "
"cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
@@ -1660,8 +1648,7 @@ int ecryptfs_destroy_crypto(void)
list_for_each_entry_safe(key_tfm, key_tfm_tmp, &key_tfm_list,
key_tfm_list) {
list_del(&key_tfm->key_tfm_list);
- if (key_tfm->key_tfm)
- crypto_free_blkcipher(key_tfm->key_tfm);
+ crypto_free_skcipher(key_tfm->key_tfm);
kmem_cache_free(ecryptfs_key_tfm_cache, key_tfm);
}
mutex_unlock(&key_tfm_list_mutex);
@@ -1747,7 +1734,7 @@ int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm)
* Searches for cached item first, and creates new if not found.
* Returns 0 on success, non-zero if adding new cipher failed
*/
-int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
+int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
struct mutex **tfm_mutex,
char *cipher_name)
{
@@ -1944,7 +1931,6 @@ out:
int ecryptfs_encrypt_and_encode_filename(
char **encoded_name,
size_t *encoded_name_size,
- struct ecryptfs_crypt_stat *crypt_stat,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size)
{
@@ -1953,9 +1939,8 @@ int ecryptfs_encrypt_and_encode_filename(
(*encoded_name) = NULL;
(*encoded_name_size) = 0;
- if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
- || (mount_crypt_stat && (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+ if (mount_crypt_stat && (mount_crypt_stat->flags
+ & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
struct ecryptfs_filename *filename;
filename = kzalloc(sizeof(*filename), GFP_KERNEL);
@@ -1968,8 +1953,7 @@ int ecryptfs_encrypt_and_encode_filename(
}
filename->filename = (char *)name;
filename->filename_size = name_size;
- rc = ecryptfs_encrypt_filename(filename, crypt_stat,
- mount_crypt_stat);
+ rc = ecryptfs_encrypt_filename(filename, mount_crypt_stat);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt "
"filename; rc = [%d]\n", __func__, rc);
@@ -1980,11 +1964,9 @@ int ecryptfs_encrypt_and_encode_filename(
NULL, &encoded_name_no_prefix_size,
filename->encrypted_filename,
filename->encrypted_filename_size);
- if ((crypt_stat && (crypt_stat->flags
- & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
- || (mount_crypt_stat
+ if (mount_crypt_stat
&& (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))
(*encoded_name_size) =
(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+ encoded_name_no_prefix_size);
@@ -2002,11 +1984,9 @@ int ecryptfs_encrypt_and_encode_filename(
kfree(filename);
goto out;
}
- if ((crypt_stat && (crypt_stat->flags
- & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
- || (mount_crypt_stat
+ if (mount_crypt_stat
&& (mount_crypt_stat->flags
- & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+ & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)) {
memcpy((*encoded_name),
ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
@@ -2120,7 +2100,7 @@ out:
int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
- struct blkcipher_desc desc;
+ struct crypto_skcipher *tfm;
struct mutex *tfm_mutex;
size_t cipher_blocksize;
int rc;
@@ -2130,7 +2110,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
return 0;
}
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
mount_crypt_stat->global_default_fn_cipher_name);
if (unlikely(rc)) {
(*namelen) = 0;
@@ -2138,7 +2118,7 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
}
mutex_lock(tfm_mutex);
- cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm);
+ cipher_blocksize = crypto_skcipher_blocksize(tfm);
mutex_unlock(tfm_mutex);
/* Return an exact amount for the common cases */
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 7b39260c7bba..d123fbaa28e0 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -28,6 +28,7 @@
#ifndef ECRYPTFS_KERNEL_H
#define ECRYPTFS_KERNEL_H
+#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
#include <linux/fs.h>
@@ -38,7 +39,6 @@
#include <linux/nsproxy.h>
#include <linux/backing-dev.h>
#include <linux/ecryptfs.h>
-#include <linux/crypto.h>
#define ECRYPTFS_DEFAULT_IV_BYTES 16
#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
@@ -233,9 +233,9 @@ struct ecryptfs_crypt_stat {
size_t extent_shift;
unsigned int extent_mask;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
- struct crypto_ablkcipher *tfm;
- struct crypto_hash *hash_tfm; /* Crypto context for generating
- * the initialization vectors */
+ struct crypto_skcipher *tfm;
+ struct crypto_shash *hash_tfm; /* Crypto context for generating
+ * the initialization vectors */
unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
@@ -309,7 +309,7 @@ struct ecryptfs_global_auth_tok {
* keeps a list of crypto API contexts around to use when needed.
*/
struct ecryptfs_key_tfm {
- struct crypto_blkcipher *key_tfm;
+ struct crypto_skcipher *key_tfm;
size_t key_size;
struct mutex key_tfm_mutex;
struct list_head key_tfm_list;
@@ -569,7 +569,6 @@ int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
int ecryptfs_encrypt_and_encode_filename(
char **encoded_name,
size_t *encoded_name_size,
- struct ecryptfs_crypt_stat *crypt_stat,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
const char *name, size_t name_size);
struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
@@ -659,7 +658,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name,
int ecryptfs_init_crypto(void);
int ecryptfs_destroy_crypto(void);
int ecryptfs_tfm_exists(char *cipher_name, struct ecryptfs_key_tfm **key_tfm);
-int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
+int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_skcipher **tfm,
struct mutex **tfm_mutex,
char *cipher_name);
int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2e47ba5d313..224b49e71aa4 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -29,7 +29,6 @@
#include <linux/dcache.h>
#include <linux/namei.h>
#include <linux/mount.h>
-#include <linux/crypto.h>
#include <linux/fs_stack.h>
#include <linux/slab.h>
#include <linux/xattr.h>
@@ -41,13 +40,13 @@ static struct dentry *lock_parent(struct dentry *dentry)
struct dentry *dir;
dir = dget_parent(dentry);
- mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
return dir;
}
static void unlock_dir(struct dentry *dir)
{
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
dput(dir);
}
@@ -282,9 +281,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
if (rc) {
ecryptfs_do_unlink(directory_inode, ecryptfs_dentry,
ecryptfs_inode);
- make_bad_inode(ecryptfs_inode);
- unlock_new_inode(ecryptfs_inode);
- iput(ecryptfs_inode);
+ iget_failed(ecryptfs_inode);
goto out;
}
unlock_new_inode(ecryptfs_inode);
@@ -399,11 +396,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
int rc = 0;
lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
- mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
- lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+ lower_dentry = lookup_one_len_unlocked(ecryptfs_dentry->d_name.name,
lower_dir_dentry,
ecryptfs_dentry->d_name.len);
- mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -421,18 +416,16 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
dput(lower_dentry);
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
- NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+ mount_crypt_stat, ecryptfs_dentry->d_name.name,
ecryptfs_dentry->d_name.len);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt and encode "
"filename; rc = [%d]\n", __func__, rc);
goto out;
}
- mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
- lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+ lower_dentry = lookup_one_len_unlocked(encrypted_and_encoded_name,
lower_dir_dentry,
encrypted_and_encoded_name_size);
- mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
if (IS_ERR(lower_dentry)) {
rc = PTR_ERR(lower_dentry);
ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -504,7 +497,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
dir->i_sb)->mount_crypt_stat;
rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
&encoded_symlen,
- NULL,
mount_crypt_stat, symname,
strlen(symname));
if (rc)
@@ -674,16 +666,24 @@ out:
return rc ? ERR_PTR(rc) : buf;
}
-static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *ecryptfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
size_t len;
- char *buf = ecryptfs_readlink_lower(dentry, &len);
+ char *buf;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ buf = ecryptfs_readlink_lower(dentry, &len);
if (IS_ERR(buf))
return buf;
fsstack_copy_attr_atime(d_inode(dentry),
d_inode(ecryptfs_dentry_to_lower(dentry)));
buf[len] = '\0';
- return *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
+ return buf;
}
/**
@@ -763,10 +763,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
} else { /* ia->ia_size < i_size_read(inode) */
/* We're chopping off all the pages down to the page
* in which ia->ia_size is located. Fill in the end of
- * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
- * PAGE_CACHE_SIZE with zeros. */
- size_t num_zeros = (PAGE_CACHE_SIZE
- - (ia->ia_size & ~PAGE_CACHE_MASK));
+ * that page from (ia->ia_size & ~PAGE_MASK) to
+ * PAGE_SIZE with zeros. */
+ size_t num_zeros = (PAGE_SIZE
+ - (ia->ia_size & ~PAGE_MASK));
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
truncate_setsize(inode, ia->ia_size);
@@ -863,9 +863,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = notify_change(lower_dentry, &lower_ia, NULL);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
}
return rc;
}
@@ -964,9 +964,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
lower_ia.ia_valid &= ~ATTR_MODE;
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = notify_change(lower_dentry, &lower_ia, NULL);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
fsstack_copy_attr_all(inode, lower_inode);
return rc;
@@ -1042,10 +1042,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
size);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
return rc;
}
@@ -1069,9 +1069,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
return rc;
}
@@ -1086,17 +1086,16 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
rc = -EOPNOTSUPP;
goto out;
}
- mutex_lock(&d_inode(lower_dentry)->i_mutex);
+ inode_lock(d_inode(lower_dentry));
rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name);
- mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+ inode_unlock(d_inode(lower_dentry));
out:
return rc;
}
const struct inode_operations ecryptfs_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = ecryptfs_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ecryptfs_get_link,
.permission = ecryptfs_permission,
.setattr = ecryptfs_setattr,
.getattr = ecryptfs_getattr_link,
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 6bd67e2011f0..3cf1546dca82 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -25,11 +25,12 @@
* 02111-1307, USA.
*/
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/string.h>
#include <linux/pagemap.h>
#include <linux/key.h>
#include <linux/random.h>
-#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include "ecryptfs_kernel.h"
@@ -601,12 +602,13 @@ struct ecryptfs_write_tag_70_packet_silly_stack {
struct ecryptfs_auth_tok *auth_tok;
struct scatterlist src_sg[2];
struct scatterlist dst_sg[2];
- struct blkcipher_desc desc;
+ struct crypto_skcipher *skcipher_tfm;
+ struct skcipher_request *skcipher_req;
char iv[ECRYPTFS_MAX_IV_BYTES];
char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
- struct hash_desc hash_desc;
- struct scatterlist hash_sg;
+ struct crypto_shash *hash_tfm;
+ struct shash_desc *hash_desc;
};
/**
@@ -629,14 +631,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
struct key *auth_tok_key = NULL;
int rc = 0;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
"[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
- rc = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
(*packet_size) = 0;
rc = ecryptfs_find_auth_tok_for_sig(
&auth_tok_key,
@@ -649,7 +649,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
goto out;
}
rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
- &s->desc.tfm,
+ &s->skcipher_tfm,
&s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
if (unlikely(rc)) {
printk(KERN_ERR "Internal error whilst attempting to get "
@@ -658,7 +658,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
goto out;
}
mutex_lock(s->tfm_mutex);
- s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
+ s->block_size = crypto_skcipher_blocksize(s->skcipher_tfm);
/* Plus one for the \0 separator between the random prefix
* and the plaintext filename */
s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
@@ -691,6 +691,19 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
rc = -EINVAL;
goto out_unlock;
}
+
+ s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
+ if (!s->skcipher_req) {
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "skcipher_request_alloc for %s\n", __func__,
+ crypto_skcipher_driver_name(s->skcipher_tfm));
+ rc = -ENOMEM;
+ goto out_unlock;
+ }
+
+ skcipher_request_set_callback(s->skcipher_req,
+ CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
GFP_KERNEL);
if (!s->block_aligned_filename) {
@@ -700,7 +713,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
rc = -ENOMEM;
goto out_unlock;
}
- s->i = 0;
dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
rc = ecryptfs_write_packet_length(&dest[s->i],
(ECRYPTFS_SIG_SIZE
@@ -738,40 +750,36 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
"password tokens\n", __func__);
goto out_free_unlock;
}
- sg_init_one(
- &s->hash_sg,
- (u8 *)s->auth_tok->token.password.session_key_encryption_key,
- s->auth_tok->token.password.session_key_encryption_key_bytes);
- s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(s->hash_desc.tfm)) {
- rc = PTR_ERR(s->hash_desc.tfm);
+ s->hash_tfm = crypto_alloc_shash(ECRYPTFS_TAG_70_DIGEST, 0, 0);
+ if (IS_ERR(s->hash_tfm)) {
+ rc = PTR_ERR(s->hash_tfm);
printk(KERN_ERR "%s: Error attempting to "
"allocate hash crypto context; rc = [%d]\n",
__func__, rc);
goto out_free_unlock;
}
- rc = crypto_hash_init(&s->hash_desc);
- if (rc) {
- printk(KERN_ERR
- "%s: Error initializing crypto hash; rc = [%d]\n",
- __func__, rc);
- goto out_release_free_unlock;
- }
- rc = crypto_hash_update(
- &s->hash_desc, &s->hash_sg,
- s->auth_tok->token.password.session_key_encryption_key_bytes);
- if (rc) {
- printk(KERN_ERR
- "%s: Error updating crypto hash; rc = [%d]\n",
- __func__, rc);
+
+ s->hash_desc = kmalloc(sizeof(*s->hash_desc) +
+ crypto_shash_descsize(s->hash_tfm), GFP_KERNEL);
+ if (!s->hash_desc) {
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "kmalloc [%zd] bytes\n", __func__,
+ sizeof(*s->hash_desc) +
+ crypto_shash_descsize(s->hash_tfm));
+ rc = -ENOMEM;
goto out_release_free_unlock;
}
- rc = crypto_hash_final(&s->hash_desc, s->hash);
+
+ s->hash_desc->tfm = s->hash_tfm;
+ s->hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ rc = crypto_shash_digest(s->hash_desc,
+ (u8 *)s->auth_tok->token.password.session_key_encryption_key,
+ s->auth_tok->token.password.session_key_encryption_key_bytes,
+ s->hash);
if (rc) {
printk(KERN_ERR
- "%s: Error finalizing crypto hash; rc = [%d]\n",
+ "%s: Error computing crypto hash; rc = [%d]\n",
__func__, rc);
goto out_release_free_unlock;
}
@@ -780,27 +788,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
== (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
- sg_init_one(&s->hash_sg, (u8 *)s->hash,
- ECRYPTFS_TAG_70_DIGEST_SIZE);
- rc = crypto_hash_init(&s->hash_desc);
- if (rc) {
- printk(KERN_ERR
- "%s: Error initializing crypto hash; "
- "rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
- }
- rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
- ECRYPTFS_TAG_70_DIGEST_SIZE);
+ rc = crypto_shash_digest(s->hash_desc, (u8 *)s->hash,
+ ECRYPTFS_TAG_70_DIGEST_SIZE,
+ s->tmp_hash);
if (rc) {
printk(KERN_ERR
- "%s: Error updating crypto hash; "
- "rc = [%d]\n", __func__, rc);
- goto out_release_free_unlock;
- }
- rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
- if (rc) {
- printk(KERN_ERR
- "%s: Error finalizing crypto hash; "
+ "%s: Error computing crypto hash; "
"rc = [%d]\n", __func__, rc);
goto out_release_free_unlock;
}
@@ -834,10 +827,8 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
* of the IV here, so we just use 0's for the IV. Note the
* constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
* >= ECRYPTFS_MAX_IV_BYTES. */
- memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
- s->desc.info = s->iv;
- rc = crypto_blkcipher_setkey(
- s->desc.tfm,
+ rc = crypto_skcipher_setkey(
+ s->skcipher_tfm,
s->auth_tok->token.password.session_key_encryption_key,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
if (rc < 0) {
@@ -850,8 +841,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
goto out_release_free_unlock;
}
- rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg,
- s->block_aligned_filename_size);
+ skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
+ s->block_aligned_filename_size, s->iv);
+ rc = crypto_skcipher_encrypt(s->skcipher_req);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt filename; "
"rc = [%d]\n", __func__, rc);
@@ -861,7 +853,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
(*packet_size) = s->i;
(*remaining_bytes) -= (*packet_size);
out_release_free_unlock:
- crypto_free_hash(s->hash_desc.tfm);
+ crypto_free_shash(s->hash_tfm);
out_free_unlock:
kzfree(s->block_aligned_filename);
out_unlock:
@@ -871,6 +863,8 @@ out:
up_write(&(auth_tok_key->sem));
key_put(auth_tok_key);
}
+ skcipher_request_free(s->skcipher_req);
+ kzfree(s->hash_desc);
kfree(s);
return rc;
}
@@ -888,7 +882,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
struct ecryptfs_auth_tok *auth_tok;
struct scatterlist src_sg[2];
struct scatterlist dst_sg[2];
- struct blkcipher_desc desc;
+ struct crypto_skcipher *skcipher_tfm;
+ struct skcipher_request *skcipher_req;
char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
char iv[ECRYPTFS_MAX_IV_BYTES];
char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
@@ -922,14 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
(*packet_size) = 0;
(*filename_size) = 0;
(*filename) = NULL;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
"[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
- rc = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
- s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
"at least [%d]\n", __func__, max_packet_size,
@@ -992,7 +985,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
rc);
goto out;
}
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->skcipher_tfm,
&s->tfm_mutex,
s->cipher_string);
if (unlikely(rc)) {
@@ -1030,12 +1023,23 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
__func__, rc, s->block_aligned_filename_size);
goto out_free_unlock;
}
+
+ s->skcipher_req = skcipher_request_alloc(s->skcipher_tfm, GFP_KERNEL);
+ if (!s->skcipher_req) {
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "skcipher_request_alloc for %s\n", __func__,
+ crypto_skcipher_driver_name(s->skcipher_tfm));
+ rc = -ENOMEM;
+ goto out_free_unlock;
+ }
+
+ skcipher_request_set_callback(s->skcipher_req,
+ CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
/* The characters in the first block effectively do the job of
* the IV here, so we just use 0's for the IV. Note the
* constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
* >= ECRYPTFS_MAX_IV_BYTES. */
- memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
- s->desc.info = s->iv;
/* TODO: Support other key modules than passphrase for
* filename encryption */
if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -1044,8 +1048,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
"password tokens\n", __func__);
goto out_free_unlock;
}
- rc = crypto_blkcipher_setkey(
- s->desc.tfm,
+ rc = crypto_skcipher_setkey(
+ s->skcipher_tfm,
s->auth_tok->token.password.session_key_encryption_key,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
if (rc < 0) {
@@ -1058,14 +1062,14 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
mount_crypt_stat->global_default_fn_cipher_key_bytes);
goto out_free_unlock;
}
- rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg,
- s->block_aligned_filename_size);
+ skcipher_request_set_crypt(s->skcipher_req, s->src_sg, s->dst_sg,
+ s->block_aligned_filename_size, s->iv);
+ rc = crypto_skcipher_decrypt(s->skcipher_req);
if (rc) {
printk(KERN_ERR "%s: Error attempting to decrypt filename; "
"rc = [%d]\n", __func__, rc);
goto out_free_unlock;
}
- s->i = 0;
while (s->decrypted_filename[s->i] != '\0'
&& s->i < s->block_aligned_filename_size)
s->i++;
@@ -1108,6 +1112,7 @@ out:
up_write(&(auth_tok_key->sem));
key_put(auth_tok_key);
}
+ skcipher_request_free(s->skcipher_req);
kfree(s);
return rc;
}
@@ -1667,9 +1672,8 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
struct scatterlist dst_sg[2];
struct scatterlist src_sg[2];
struct mutex *tfm_mutex;
- struct blkcipher_desc desc = {
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
+ struct crypto_skcipher *tfm;
+ struct skcipher_request *req = NULL;
int rc = 0;
if (unlikely(ecryptfs_verbosity > 0)) {
@@ -1680,7 +1684,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
auth_tok->token.password.session_key_encryption_key,
auth_tok->token.password.session_key_encryption_key_bytes);
}
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
crypt_stat->cipher);
if (unlikely(rc)) {
printk(KERN_ERR "Internal error whilst attempting to get "
@@ -1711,8 +1715,20 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
goto out;
}
mutex_lock(tfm_mutex);
- rc = crypto_blkcipher_setkey(
- desc.tfm, auth_tok->token.password.session_key_encryption_key,
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ mutex_unlock(tfm_mutex);
+ printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+ "skcipher_request_alloc for %s\n", __func__,
+ crypto_skcipher_driver_name(tfm));
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ rc = crypto_skcipher_setkey(
+ tfm, auth_tok->token.password.session_key_encryption_key,
crypt_stat->key_size);
if (unlikely(rc < 0)) {
mutex_unlock(tfm_mutex);
@@ -1720,8 +1736,10 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
rc = -EINVAL;
goto out;
}
- rc = crypto_blkcipher_decrypt(&desc, dst_sg, src_sg,
- auth_tok->session_key.encrypted_key_size);
+ skcipher_request_set_crypt(req, src_sg, dst_sg,
+ auth_tok->session_key.encrypted_key_size,
+ NULL);
+ rc = crypto_skcipher_decrypt(req);
mutex_unlock(tfm_mutex);
if (unlikely(rc)) {
printk(KERN_ERR "Error decrypting; rc = [%d]\n", rc);
@@ -1738,6 +1756,7 @@ decrypt_passphrase_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
crypt_stat->key_size);
}
out:
+ skcipher_request_free(req);
return rc;
}
@@ -1779,7 +1798,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
* added the our &auth_tok_list */
next_packet_is_auth_tok_packet = 1;
while (next_packet_is_auth_tok_packet) {
- size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
+ size_t max_packet_size = ((PAGE_SIZE - 8) - i);
switch (src[i]) {
case ECRYPTFS_TAG_3_PACKET_TYPE:
@@ -2191,16 +2210,14 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
size_t max_packet_size;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
crypt_stat->mount_crypt_stat;
- struct blkcipher_desc desc = {
- .tfm = NULL,
- .flags = CRYPTO_TFM_REQ_MAY_SLEEP
- };
+ struct crypto_skcipher *tfm;
+ struct skcipher_request *req;
int rc = 0;
(*packet_size) = 0;
ecryptfs_from_hex(key_rec->sig, auth_tok->token.password.signature,
ECRYPTFS_SIG_SIZE);
- rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&tfm, &tfm_mutex,
crypt_stat->cipher);
if (unlikely(rc)) {
printk(KERN_ERR "Internal error whilst attempting to get "
@@ -2209,12 +2226,11 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
goto out;
}
if (mount_crypt_stat->global_default_cipher_key_size == 0) {
- struct blkcipher_alg *alg = crypto_blkcipher_alg(desc.tfm);
-
printk(KERN_WARNING "No key size specified at mount; "
- "defaulting to [%d]\n", alg->max_keysize);
+ "defaulting to [%d]\n",
+ crypto_skcipher_default_keysize(tfm));
mount_crypt_stat->global_default_cipher_key_size =
- alg->max_keysize;
+ crypto_skcipher_default_keysize(tfm);
}
if (crypt_stat->key_size == 0)
crypt_stat->key_size =
@@ -2284,20 +2300,36 @@ write_tag_3_packet(char *dest, size_t *remaining_bytes,
goto out;
}
mutex_lock(tfm_mutex);
- rc = crypto_blkcipher_setkey(desc.tfm, session_key_encryption_key,
- crypt_stat->key_size);
+ rc = crypto_skcipher_setkey(tfm, session_key_encryption_key,
+ crypt_stat->key_size);
if (rc < 0) {
mutex_unlock(tfm_mutex);
ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
"context; rc = [%d]\n", rc);
goto out;
}
+
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ mutex_unlock(tfm_mutex);
+ ecryptfs_printk(KERN_ERR, "Out of kernel memory whilst "
+ "attempting to skcipher_request_alloc for "
+ "%s\n", crypto_skcipher_driver_name(tfm));
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+
rc = 0;
ecryptfs_printk(KERN_DEBUG, "Encrypting [%zd] bytes of the key\n",
crypt_stat->key_size);
- rc = crypto_blkcipher_encrypt(&desc, dst_sg, src_sg,
- (*key_rec).enc_key_size);
+ skcipher_request_set_crypt(req, src_sg, dst_sg,
+ (*key_rec).enc_key_size, NULL);
+ rc = crypto_skcipher_encrypt(req);
mutex_unlock(tfm_mutex);
+ skcipher_request_free(req);
if (rc) {
printk(KERN_ERR "Error encrypting; rc = [%d]\n", rc);
goto out;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4f4d0474bee9..1698132d0e57 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -29,7 +29,6 @@
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/skbuff.h>
-#include <linux/crypto.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/key.h>
@@ -663,6 +662,7 @@ static struct ecryptfs_cache_info {
struct kmem_cache **cache;
const char *name;
size_t size;
+ unsigned long flags;
void (*ctor)(void *obj);
} ecryptfs_cache_infos[] = {
{
@@ -684,6 +684,7 @@ static struct ecryptfs_cache_info {
.cache = &ecryptfs_inode_info_cache,
.name = "ecryptfs_inode_cache",
.size = sizeof(struct ecryptfs_inode_info),
+ .flags = SLAB_ACCOUNT,
.ctor = inode_info_init_once,
},
{
@@ -694,12 +695,12 @@ static struct ecryptfs_cache_info {
{
.cache = &ecryptfs_header_cache,
.name = "ecryptfs_headers",
- .size = PAGE_CACHE_SIZE,
+ .size = PAGE_SIZE,
},
{
.cache = &ecryptfs_xattr_cache,
.name = "ecryptfs_xattr_cache",
- .size = PAGE_CACHE_SIZE,
+ .size = PAGE_SIZE,
},
{
.cache = &ecryptfs_key_record_cache,
@@ -755,8 +756,8 @@ static int ecryptfs_init_kmem_caches(void)
struct ecryptfs_cache_info *info;
info = &ecryptfs_cache_infos[i];
- *(info->cache) = kmem_cache_create(info->name, info->size,
- 0, SLAB_HWCACHE_ALIGN, info->ctor);
+ *(info->cache) = kmem_cache_create(info->name, info->size, 0,
+ SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
if (!*(info->cache)) {
ecryptfs_free_kmem_caches();
ecryptfs_printk(KERN_WARNING, "%s: "
@@ -817,7 +818,7 @@ static int __init ecryptfs_init(void)
{
int rc;
- if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
+ if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) {
rc = -EINVAL;
ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
"larger than the host's page size, and so "
@@ -825,7 +826,7 @@ static int __init ecryptfs_init(void)
"default eCryptfs extent size is [%u] bytes; "
"the page size is [%lu] bytes.\n",
ECRYPTFS_DEFAULT_EXTENT_SIZE,
- (unsigned long)PAGE_CACHE_SIZE);
+ (unsigned long)PAGE_SIZE);
goto out;
}
rc = ecryptfs_init_kmem_caches();
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index caba848ac763..e6b1d80952b9 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -30,7 +30,6 @@
#include <linux/page-flags.h>
#include <linux/mount.h>
#include <linux/file.h>
-#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
@@ -123,7 +122,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
struct ecryptfs_crypt_stat *crypt_stat)
{
loff_t extent_num_in_page = 0;
- loff_t num_extents_per_page = (PAGE_CACHE_SIZE
+ loff_t num_extents_per_page = (PAGE_SIZE
/ crypt_stat->extent_size);
int rc = 0;
@@ -139,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
char *page_virt;
page_virt = kmap_atomic(page);
- memset(page_virt, 0, PAGE_CACHE_SIZE);
+ memset(page_virt, 0, PAGE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
size_t written;
@@ -165,8 +164,8 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
- crypt_stat->metadata_size);
rc = ecryptfs_read_lower_page_segment(
- page, (lower_offset >> PAGE_CACHE_SHIFT),
- (lower_offset & ~PAGE_CACHE_MASK),
+ page, (lower_offset >> PAGE_SHIFT),
+ (lower_offset & ~PAGE_MASK),
crypt_stat->extent_size, page->mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
@@ -199,7 +198,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
- PAGE_CACHE_SIZE,
+ PAGE_SIZE,
page->mapping->host);
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
@@ -216,7 +215,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
} else {
rc = ecryptfs_read_lower_page_segment(
- page, page->index, 0, PAGE_CACHE_SIZE,
+ page, page->index, 0, PAGE_SIZE,
page->mapping->host);
if (rc) {
printk(KERN_ERR "Error reading page; rc = "
@@ -251,12 +250,12 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
struct inode *inode = page->mapping->host;
int end_byte_in_page;
- if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index)
+ if ((i_size_read(inode) / PAGE_SIZE) != page->index)
goto out;
- end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
+ end_byte_in_page = i_size_read(inode) % PAGE_SIZE;
if (to > end_byte_in_page)
end_byte_in_page = to;
- zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE);
+ zero_user_segment(page, end_byte_in_page, PAGE_SIZE);
out:
return 0;
}
@@ -280,7 +279,7 @@ static int ecryptfs_write_begin(struct file *file,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
loff_t prev_page_end_size;
int rc = 0;
@@ -290,14 +289,14 @@ static int ecryptfs_write_begin(struct file *file,
return -ENOMEM;
*pagep = page;
- prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
+ prev_page_end_size = ((loff_t)index << PAGE_SHIFT);
if (!PageUptodate(page)) {
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(
- page, index, 0, PAGE_CACHE_SIZE, mapping->host);
+ page, index, 0, PAGE_SIZE, mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"lower page segment; rc = [%d]\n",
@@ -323,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file,
SetPageUptodate(page);
} else {
rc = ecryptfs_read_lower_page_segment(
- page, index, 0, PAGE_CACHE_SIZE,
+ page, index, 0, PAGE_SIZE,
mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error reading "
@@ -337,9 +336,9 @@ static int ecryptfs_write_begin(struct file *file,
} else {
if (prev_page_end_size
>= i_size_read(page->mapping->host)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
SetPageUptodate(page);
- } else if (len < PAGE_CACHE_SIZE) {
+ } else if (len < PAGE_SIZE) {
rc = ecryptfs_decrypt_page(page);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
@@ -372,11 +371,11 @@ static int ecryptfs_write_begin(struct file *file,
* of page? Zero it out. */
if ((i_size_read(mapping->host) == prev_page_end_size)
&& (pos != 0))
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
out:
if (unlikely(rc)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
*pagep = NULL;
}
return rc;
@@ -436,15 +435,15 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
rc = -ENOMEM;
goto out;
}
- mutex_lock(&lower_inode->i_mutex);
+ inode_lock(lower_inode);
size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
- xattr_virt, PAGE_CACHE_SIZE);
+ xattr_virt, PAGE_SIZE);
if (size < 0)
size = 8;
put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
xattr_virt, size, 0);
- mutex_unlock(&lower_inode->i_mutex);
+ inode_unlock(lower_inode);
if (rc)
printk(KERN_ERR "Error whilst attempting to write inode size "
"to lower file xattr; rc = [%d]\n", rc);
@@ -480,8 +479,8 @@ static int ecryptfs_write_end(struct file *file,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + copied;
struct inode *ecryptfs_inode = mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
@@ -501,7 +500,7 @@ static int ecryptfs_write_end(struct file *file,
goto out;
}
if (!PageUptodate(page)) {
- if (copied < PAGE_CACHE_SIZE) {
+ if (copied < PAGE_SIZE) {
rc = 0;
goto out;
}
@@ -534,7 +533,7 @@ static int ecryptfs_write_end(struct file *file,
rc = copied;
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return rc;
}
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 09fe622274e4..158a3a39f82d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -74,7 +74,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
loff_t offset;
int rc;
- offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
+ offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
+ offset_in_page);
virt = kmap(page_for_lower);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
@@ -123,9 +123,9 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
else
pos = offset;
while (pos < (offset + size)) {
- pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
- size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
- size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
+ pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT);
+ size_t start_offset_in_page = (pos & ~PAGE_MASK);
+ size_t num_bytes = (PAGE_SIZE - start_offset_in_page);
loff_t total_remaining_bytes = ((offset + size) - pos);
if (fatal_signal_pending(current)) {
@@ -165,7 +165,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
* Fill in zero values to the end of the page */
memset(((char *)ecryptfs_page_virt
+ start_offset_in_page), 0,
- PAGE_CACHE_SIZE - start_offset_in_page);
+ PAGE_SIZE - start_offset_in_page);
}
/* pos >= offset, we are now writing the data request */
@@ -186,7 +186,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
ecryptfs_page,
start_offset_in_page,
data_offset);
- page_cache_release(ecryptfs_page);
+ put_page(ecryptfs_page);
if (rc) {
printk(KERN_ERR "%s: Error encrypting "
"page; rc = [%d]\n", __func__, rc);
@@ -262,7 +262,7 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
loff_t offset;
int rc;
- offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
+ offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
virt = kmap(page_for_ecryptfs);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index afa1b81c3418..77a486d3a51b 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -29,7 +29,6 @@
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
-#include <linux/crypto.h>
#include <linux/statfs.h>
#include <linux/magic.h>
#include "ecryptfs_kernel.h"
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 90001da9abfd..d48e0d261d78 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -10,6 +10,7 @@
#include <linux/efi.h>
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -50,9 +51,9 @@ static ssize_t efivarfs_file_write(struct file *file,
d_delete(file->f_path.dentry);
dput(file->f_path.dentry);
} else {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
bytes = count;
@@ -103,9 +104,78 @@ out_free:
return size;
}
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int i_flags;
+ unsigned int flags = 0;
+
+ i_flags = inode->i_flags;
+ if (i_flags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+
+ if (copy_to_user(arg, &flags, sizeof(flags)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+efivarfs_ioc_setxflags(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_mapping->host;
+ unsigned int flags;
+ unsigned int i_flags = 0;
+ int error;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (copy_from_user(&flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & ~FS_IMMUTABLE_FL)
+ return -EOPNOTSUPP;
+
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ if (flags & FS_IMMUTABLE_FL)
+ i_flags |= S_IMMUTABLE;
+
+
+ error = mnt_want_write_file(file);
+ if (error)
+ return error;
+
+ inode_lock(inode);
+ inode_set_flags(inode, i_flags, S_IMMUTABLE);
+ inode_unlock(inode);
+
+ mnt_drop_write_file(file);
+
+ return 0;
+}
+
+long
+efivarfs_file_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+ void __user *arg = (void __user *)p;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return efivarfs_ioc_getxflags(file, arg);
+ case FS_IOC_SETFLAGS:
+ return efivarfs_ioc_setxflags(file, arg);
+ }
+
+ return -ENOTTY;
+}
+
const struct file_operations efivarfs_file_operations = {
.open = simple_open,
.read = efivarfs_file_read,
.write = efivarfs_file_write,
.llseek = no_llseek,
+ .unlocked_ioctl = efivarfs_file_ioctl,
};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 3381b9da9ee6..e2ab6d0497f2 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -15,7 +15,8 @@
#include "internal.h"
struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev)
+ const struct inode *dir, int mode,
+ dev_t dev, bool is_removable)
{
struct inode *inode = new_inode(sb);
@@ -23,6 +24,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_fop = &efivarfs_file_operations;
@@ -102,22 +104,17 @@ static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid)
static int efivarfs_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool excl)
{
- struct inode *inode;
+ struct inode *inode = NULL;
struct efivar_entry *var;
int namelen, i = 0, err = 0;
+ bool is_removable = false;
if (!efivarfs_valid_name(dentry->d_name.name, dentry->d_name.len))
return -EINVAL;
- inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0);
- if (!inode)
- return -ENOMEM;
-
var = kzalloc(sizeof(struct efivar_entry), GFP_KERNEL);
- if (!var) {
- err = -ENOMEM;
- goto out;
- }
+ if (!var)
+ return -ENOMEM;
/* length of the variable name itself: remove GUID and separator */
namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1;
@@ -125,6 +122,16 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1,
&var->var.VendorGuid);
+ if (efivar_variable_is_removable(var->var.VendorGuid,
+ dentry->d_name.name, namelen))
+ is_removable = true;
+
+ inode = efivarfs_get_inode(dir->i_sb, dir, mode, 0, is_removable);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out;
+ }
+
for (i = 0; i < namelen; i++)
var->var.VariableName[i] = dentry->d_name.name[i];
@@ -138,7 +145,8 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry,
out:
if (err) {
kfree(var);
- iput(inode);
+ if (inode)
+ iput(inode);
}
return err;
}
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index b5ff16addb7c..b4505188e799 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -15,7 +15,8 @@ extern const struct file_operations efivarfs_file_operations;
extern const struct inode_operations efivarfs_dir_inode_operations;
extern bool efivarfs_valid_name(const char *str, int len);
extern struct inode *efivarfs_get_inode(struct super_block *sb,
- const struct inode *dir, int mode, dev_t dev);
+ const struct inode *dir, int mode, dev_t dev,
+ bool is_removable);
extern struct list_head efivarfs_list;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 86a2121828c3..553c5d2db4a4 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -118,8 +118,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
struct dentry *dentry, *root = sb->s_root;
unsigned long size = 0;
char *name;
- int len, i;
+ int len;
int err = -ENOMEM;
+ bool is_removable = false;
entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
@@ -128,15 +129,17 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
memcpy(entry->var.VariableName, name16, name_size);
memcpy(&(entry->var.VendorGuid), &vendor, sizeof(efi_guid_t));
- len = ucs2_strlen(entry->var.VariableName);
+ len = ucs2_utf8size(entry->var.VariableName);
/* name, plus '-', plus GUID, plus NUL*/
name = kmalloc(len + 1 + EFI_VARIABLE_GUID_LEN + 1, GFP_KERNEL);
if (!name)
goto fail;
- for (i = 0; i < len; i++)
- name[i] = entry->var.VariableName[i] & 0xFF;
+ ucs2_as_utf8(name, entry->var.VariableName, len);
+
+ if (efivar_variable_is_removable(entry->var.VendorGuid, name, len))
+ is_removable = true;
name[len] = '-';
@@ -144,7 +147,8 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
- inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0);
+ inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
+ is_removable);
if (!inode)
goto fail_name;
@@ -160,10 +164,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
efivar_entry_size(entry, &size);
efivar_entry_add(entry, &efivarfs_list);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode->i_private = entry;
i_size_write(inode, size + sizeof(entry->var.Attributes));
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
d_add(dentry, inode);
return 0;
@@ -193,14 +197,14 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
efivarfs_sb = sb;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = EFIVARFS_MAGIC;
sb->s_op = &efivarfs_ops;
sb->s_d_op = &efivarfs_d_ops;
sb->s_time_gran = 1;
- inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+ inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true);
if (!inode)
return -ENOMEM;
inode->i_op = &efivarfs_dir_inode_operations;
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 079d20306ee1..cdf0872382af 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -151,6 +151,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &efs_symlink_aops;
break;
case S_IFCHR:
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c8411a30f7da..cb68dac4f9d3 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -94,9 +94,9 @@ static void init_once(void *foo)
static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
- sizeof(struct efs_inode_info),
- 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- init_once);
+ sizeof(struct efs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, init_once);
if (efs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 75117d0dac2b..4870cc82deb0 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -13,7 +13,7 @@
static int efs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct buffer_head * bh;
struct inode * inode = page->mapping->host;
efs_block_t size = inode->i_size;
@@ -39,12 +39,10 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
}
link[size] = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ed70cf9fdc7b..1231cd1999d8 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
u64 count;
poll_wait(file, &ctx->wqh, wait);
- smp_rmb();
- count = ctx->count;
+
+ /*
+ * All writes to ctx->count occur within ctx->wqh.lock. This read
+ * can be done outside ctx->wqh.lock because we know that poll_wait
+ * takes that lock (through add_wait_queue) if our caller will sleep.
+ *
+ * The read _can_ therefore seep into add_wait_queue's critical
+ * section, but cannot move above it! add_wait_queue's spin_lock acts
+ * as an acquire barrier and ensures that the read be ordered properly
+ * against the writes. The following CAN happen and is safe:
+ *
+ * poll write
+ * ----------------- ------------
+ * lock ctx->wqh.lock (in poll_wait)
+ * count = ctx->count
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * lock ctx->wqh.lock
+ * ctx->count += n
+ * if (waitqueue_active)
+ * wake_up_locked_poll
+ * unlock ctx->wqh.lock
+ * eventfd_poll returns 0
+ *
+ * but the following, which would miss a wakeup, cannot happen:
+ *
+ * poll write
+ * ----------------- ------------
+ * count = ctx->count (INVALID!)
+ * lock ctx->wqh.lock
+ * ctx->count += n
+ * **waitqueue_active is false**
+ * **no wake_up_locked_poll!**
+ * unlock ctx->wqh.lock
+ * lock ctx->wqh.lock (in poll_wait)
+ * __add_wait_queue
+ * unlock ctx->wqh.lock
+ * eventfd_poll returns 0
+ */
+ count = READ_ONCE(ctx->count);
if (count > 0)
events |= POLLIN;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1e009cad8d5c..8a74a2a52e0f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -92,7 +92,12 @@
*/
/* Epoll private bits inside the event mask */
-#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
+#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
+
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
@@ -1002,6 +1007,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
+ int ewake = 0;
if ((unsigned long)key & POLLFREE) {
ep_pwq_from_wait(wait)->whead = NULL;
@@ -1066,8 +1072,25 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
- if (waitqueue_active(&ep->wq))
+ if (waitqueue_active(&ep->wq)) {
+ if ((epi->event.events & EPOLLEXCLUSIVE) &&
+ !((unsigned long)key & POLLFREE)) {
+ switch ((unsigned long)key & EPOLLINOUT_BITS) {
+ case POLLIN:
+ if (epi->event.events & POLLIN)
+ ewake = 1;
+ break;
+ case POLLOUT:
+ if (epi->event.events & POLLOUT)
+ ewake = 1;
+ break;
+ case 0:
+ ewake = 1;
+ break;
+ }
+ }
wake_up_locked(&ep->wq);
+ }
if (waitqueue_active(&ep->poll_wait))
pwake++;
@@ -1078,6 +1101,9 @@ out_unlock:
if (pwake)
ep_poll_safewake(&ep->poll_wait);
+ if (epi->event.events & EPOLLEXCLUSIVE)
+ return ewake;
+
return 1;
}
@@ -1095,7 +1121,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
- add_wait_queue(whead, &pwq->wait);
+ if (epi->event.events & EPOLLEXCLUSIVE)
+ add_wait_queue_exclusive(whead, &pwq->wait);
+ else
+ add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
@@ -1587,7 +1616,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
unsigned long flags;
- long slack = 0;
+ u64 slack = 0;
wait_queue_t wait;
ktime_t expires, *to = NULL;
@@ -1862,6 +1891,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
goto error_tgt_fput;
/*
+ * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
+ * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
+ * Also, we do not currently support nested exclusive wakeups.
+ */
+ if (epds.events & EPOLLEXCLUSIVE) {
+ if (op == EPOLL_CTL_MOD)
+ goto error_tgt_fput;
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+ goto error_tgt_fput;
+ }
+
+ /*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
@@ -1932,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
break;
case EPOLL_CTL_MOD:
if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
+ if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+ epds.events |= POLLERR | POLLHUP;
+ error = ep_modify(ep, epi, &epds);
+ }
} else
error = -ENOENT;
break;
diff --git a/fs/exec.c b/fs/exec.c
index b06623a9347f..c4010b8207a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
+#include <linux/vmalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -119,7 +120,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
- .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+ .acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
@@ -198,8 +199,12 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
return NULL;
}
#endif
- ret = get_user_pages(current, bprm->mm, pos,
- 1, write, 1, &page, NULL);
+ /*
+ * We are doing an exec(). 'current' is the process
+ * doing the exec and bprm->mm is the new process's mm.
+ */
+ ret = get_user_pages_remote(current, bprm->mm, pos, 1, write,
+ 1, &page, NULL);
if (ret <= 0)
return NULL;
@@ -763,7 +768,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
int err;
struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
- .acc_mode = MAY_EXEC | MAY_OPEN,
+ .acc_mode = MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
@@ -831,6 +836,97 @@ int kernel_read(struct file *file, loff_t offset,
EXPORT_SYMBOL(kernel_read);
+int kernel_read_file(struct file *file, void **buf, loff_t *size,
+ loff_t max_size, enum kernel_read_file_id id)
+{
+ loff_t i_size, pos;
+ ssize_t bytes = 0;
+ int ret;
+
+ if (!S_ISREG(file_inode(file)->i_mode) || max_size < 0)
+ return -EINVAL;
+
+ ret = security_kernel_read_file(file, id);
+ if (ret)
+ return ret;
+
+ i_size = i_size_read(file_inode(file));
+ if (max_size > 0 && i_size > max_size)
+ return -EFBIG;
+ if (i_size <= 0)
+ return -EINVAL;
+
+ *buf = vmalloc(i_size);
+ if (!*buf)
+ return -ENOMEM;
+
+ pos = 0;
+ while (pos < i_size) {
+ bytes = kernel_read(file, pos, (char *)(*buf) + pos,
+ i_size - pos);
+ if (bytes < 0) {
+ ret = bytes;
+ goto out;
+ }
+
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+
+ if (pos != i_size) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = security_kernel_post_read_file(file, *buf, i_size, id);
+ if (!ret)
+ *size = pos;
+
+out:
+ if (ret < 0) {
+ vfree(*buf);
+ *buf = NULL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file);
+
+int kernel_read_file_from_path(char *path, void **buf, loff_t *size,
+ loff_t max_size, enum kernel_read_file_id id)
+{
+ struct file *file;
+ int ret;
+
+ if (!path || !*path)
+ return -EINVAL;
+
+ file = filp_open(path, O_RDONLY, 0);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ ret = kernel_read_file(file, buf, size, max_size, id);
+ fput(file);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_path);
+
+int kernel_read_file_from_fd(int fd, void **buf, loff_t *size, loff_t max_size,
+ enum kernel_read_file_id id)
+{
+ struct fd f = fdget(fd);
+ int ret = -EBADF;
+
+ if (!f.file)
+ goto out;
+
+ ret = kernel_read_file(f.file, buf, size, max_size, id);
+out:
+ fdput(f);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
+
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
@@ -1307,13 +1403,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
return;
/* Be careful if suid/sgid is set */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* reload atomically mode/uid/gid now that lock held */
mode = inode->i_mode;
uid = inode->i_uid;
gid = inode->i_gid;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/* We ignore suid/sgid if there are no mappings for them in the ns */
if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index e5bb2abf77f9..547b93cbea63 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -41,16 +41,16 @@ static inline unsigned exofs_chunk_size(struct inode *inode)
static inline void exofs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
{
loff_t last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
@@ -85,13 +85,13 @@ static void exofs_check_page(struct page *page)
unsigned chunk_size = exofs_chunk_size(dir);
char *kaddr = page_address(page);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
struct exofs_dir_entry *p;
char *error;
/* if the page is the last one in the directory */
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -138,7 +138,7 @@ bad_entry:
EXOFS_ERR(
"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
"offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)),
rec_len, p->name_len);
goto fail;
@@ -147,7 +147,7 @@ Eend:
EXOFS_ERR("ERROR [exofs_check_page]: "
"entry in directory(0x%lx) spans the page boundary"
"offset=%lu, inode=0x%llx\n",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
_LLU(le64_to_cpu(p->inode_no)));
fail:
SetPageChecked(page);
@@ -237,8 +237,8 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
{
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
int need_revalidate = (file->f_version != inode->i_version);
@@ -254,7 +254,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
if (IS_ERR(page)) {
EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
@@ -262,7 +262,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
if (offset) {
offset = exofs_validate_entry(kaddr, offset,
chunk_mask);
- ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode->i_version;
need_revalidate = 0;
@@ -449,7 +449,7 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + exofs_last_byte(dir, n);
de = (struct exofs_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
name_len = 0;
@@ -602,7 +602,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
kunmap_atomic(kaddr);
err = exofs_commit_chunk(page, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 906de66e8e7e..28645f0640f7 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = sync_inode_metadata(filp->f_mapping->host, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 73c64daa0f55..49e1bd00b4ec 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -317,7 +317,7 @@ static int read_exec(struct page_collect *pcol)
if (!pcol->ios) {
int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
- pcol->pg_first << PAGE_CACHE_SHIFT,
+ pcol->pg_first << PAGE_SHIFT,
pcol->length, &pcol->ios);
if (ret)
@@ -383,7 +383,7 @@ static int readpage_strip(void *data, struct page *page)
struct inode *inode = pcol->inode;
struct exofs_i_info *oi = exofs_i(inode);
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
size_t len;
int ret;
@@ -397,9 +397,9 @@ static int readpage_strip(void *data, struct page *page)
pcol->that_locked_page = page;
if (page->index < end_index)
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
else if (page->index == end_index)
- len = i_size & ~PAGE_CACHE_MASK;
+ len = i_size & ~PAGE_MASK;
else
len = 0;
@@ -442,8 +442,8 @@ try_again:
goto fail;
}
- if (len != PAGE_CACHE_SIZE)
- zero_user(page, len, PAGE_CACHE_SIZE - len);
+ if (len != PAGE_SIZE)
+ zero_user(page, len, PAGE_SIZE - len);
EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
inode->i_ino, page->index, len);
@@ -592,10 +592,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
return page;
} else {
@@ -612,7 +609,7 @@ static void __r4w_put_page(void *priv, struct page *page)
if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
EXOFS_DBGMSG2("index=0x%lx\n", page->index);
- page_cache_release(page);
+ put_page(page);
return;
}
EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
@@ -636,7 +633,7 @@ static int write_exec(struct page_collect *pcol)
BUG_ON(pcol->ios);
ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
- pcol->pg_first << PAGE_CACHE_SHIFT,
+ pcol->pg_first << PAGE_SHIFT,
pcol->length, &pcol->ios);
if (unlikely(ret))
goto err;
@@ -699,7 +696,7 @@ static int writepage_strip(struct page *page,
struct inode *inode = pcol->inode;
struct exofs_i_info *oi = exofs_i(inode);
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
size_t len;
int ret;
@@ -711,9 +708,9 @@ static int writepage_strip(struct page *page,
if (page->index < end_index)
/* in this case, the page is within the limits of the file */
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
else {
- len = i_size & ~PAGE_CACHE_MASK;
+ len = i_size & ~PAGE_MASK;
if (page->index > end_index || !len) {
/* in this case, the page is outside the limits
@@ -793,10 +790,10 @@ static int exofs_writepages(struct address_space *mapping,
long start, end, expected_pages;
int ret;
- start = wbc->range_start >> PAGE_CACHE_SHIFT;
+ start = wbc->range_start >> PAGE_SHIFT;
end = (wbc->range_end == LLONG_MAX) ?
start + mapping->nrpages :
- wbc->range_end >> PAGE_CACHE_SHIFT;
+ wbc->range_end >> PAGE_SHIFT;
if (start || end)
expected_pages = end - start + 1;
@@ -884,15 +881,15 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
}
/* read modify write */
- if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+ if (!PageUptodate(page) && (len != PAGE_SIZE)) {
loff_t i_size = i_size_read(mapping->host);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
size_t rlen;
if (page->index < end_index)
- rlen = PAGE_CACHE_SIZE;
+ rlen = PAGE_SIZE;
else if (page->index == end_index)
- rlen = i_size & ~PAGE_CACHE_MASK;
+ rlen = i_size & ~PAGE_MASK;
else
rlen = 0;
@@ -1227,6 +1224,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
inode->i_link = (char *)oi->i_data;
} else {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
}
} else {
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 09a6bb1ad63c..622a686bb08b 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -80,9 +80,6 @@ static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
struct inode *inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
inode = exofs_new_inode(dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
@@ -114,6 +111,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
if (l > sizeof(oi->i_data)) {
/* slow symlink */
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &exofs_aops;
memset(oi->i_data, 0, sizeof(oi->i_data));
@@ -294,11 +292,11 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b5e1..6658a50530a0 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
{
exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
sizeof(struct exofs_i_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- exofs_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, exofs_init_once);
if (exofs_inode_cachep == NULL)
return -ENOMEM;
return 0;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 714cd37a6ba3..c46f1a190b8d 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
int err;
parent = ERR_PTR(-EACCES);
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
if (mnt->mnt_sb->s_export_op->get_parent)
parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
dprintk("%s: get_parent of %ld failed, err %d\n",
@@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
if (IS_ERR(tmp)) {
dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
goto out_err;
@@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
*/
err = exportfs_get_name(mnt, target_dir, nbuf, result);
if (!err) {
- mutex_lock(&target_dir->d_inode->i_mutex);
+ inode_lock(target_dir->d_inode);
nresult = lookup_one_len(nbuf, target_dir,
strlen(nbuf));
- mutex_unlock(&target_dir->d_inode->i_mutex);
+ inode_unlock(target_dir->d_inode);
if (!IS_ERR(nresult)) {
if (nresult->d_inode) {
dput(result);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 0c6638b40f21..7ff6fcfa685d 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -37,7 +37,7 @@ static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
{
unsigned len = le16_to_cpu(dlen);
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len == EXT2_MAX_REC_LEN)
return 1 << 16;
#endif
@@ -46,7 +46,7 @@ static inline unsigned ext2_rec_len_from_disk(__le16 dlen)
static inline __le16 ext2_rec_len_to_disk(unsigned len)
{
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len == (1 << 16))
return cpu_to_le16(EXT2_MAX_REC_LEN);
else
@@ -67,7 +67,7 @@ static inline unsigned ext2_chunk_size(struct inode *inode)
static inline void ext2_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -79,9 +79,9 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
{
unsigned last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
@@ -118,12 +118,12 @@ static void ext2_check_page(struct page *page, int quiet)
char *kaddr = page_address(page);
u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
ext2_dirent *p;
char *error;
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -176,7 +176,7 @@ bad_entry:
if (!quiet)
ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
(unsigned long) le32_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
@@ -186,7 +186,7 @@ Eend:
ext2_error(sb, "ext2_check_page",
"entry in directory #%lu spans the page boundary"
"offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
(unsigned long) le32_to_cpu(p->inode));
}
fail:
@@ -287,8 +287,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
unsigned char *types = NULL;
@@ -309,14 +309,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
ext2_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_SIZE - offset;
return PTR_ERR(page);
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ext2_validate_entry(kaddr, offset, chunk_mask);
- ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode->i_version;
need_revalidate = 0;
@@ -406,7 +406,7 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
if (++n >= npages)
n = 0;
/* next page is past the blocks we've got */
- if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
ext2_error(dir->i_sb, __func__,
"dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
@@ -511,7 +511,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + ext2_last_byte(dir, n);
de = (ext2_dirent *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -655,7 +655,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
kunmap_atomic(kaddr);
err = ext2_commit_chunk(page, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 8d15febd0aa3..170939f379d7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -61,6 +61,8 @@ struct ext2_block_alloc_info {
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
+struct mb_cache;
+
/*
* second extended-fs super-block data in memory
*/
@@ -111,6 +113,7 @@ struct ext2_sb_info {
* of the mount options.
*/
spinlock_t s_lock;
+ struct mb_cache *s_mb_cache;
};
static inline spinlock_t *
@@ -684,6 +687,9 @@ struct ext2_inode_info {
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
+#ifdef CONFIG_FS_DAX
+ struct rw_semaphore dax_sem;
+#endif
/*
* truncate_mutex is for serialising ext2_truncate() against
@@ -699,6 +705,14 @@ struct ext2_inode_info {
#endif
};
+#ifdef CONFIG_FS_DAX
+#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
+#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
+#else
+#define dax_sem_down_write(ext2_inode)
+#define dax_sem_up_write(ext2_inode)
+#endif
+
/*
* Inode dynamic state flags
*/
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 1982c3f11aec..c1400b109805 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -27,27 +27,88 @@
#include "acl.h"
#ifdef CONFIG_FS_DAX
+/*
+ * The lock ordering for ext2 DAX fault paths is:
+ *
+ * mmap_sem (MM)
+ * sb_start_pagefault (vfs, freeze)
+ * ext2_inode_info->dax_sem
+ * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
+ * ext2_inode_info->truncate_mutex
+ *
+ * The default page_lock and i_size verification done by non-DAX fault paths
+ * is sufficient because ext2 doesn't support hole punching.
+ */
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
- return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
-static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ loff_t size;
+ int ret;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&ei->dax_sem);
+
+ /* check that the faulting page hasn't raced with truncate */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = dax_pfn_mkwrite(vma, vmf);
+
+ up_read(&ei->dax_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
.pmd_fault = ext2_dax_pmd_fault,
- .page_mkwrite = ext2_dax_mkwrite,
- .pfn_mkwrite = dax_pfn_mkwrite,
+ .page_mkwrite = ext2_dax_fault,
+ .pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c60a248c640c..6bd58e6ff038 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode,
* so that it's not found by another thread before it's
* initialised
*/
- err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
- 1 << inode->i_blkbits);
+ err = dax_clear_sectors(inode->i_sb->s_bdev,
+ le32_to_cpu(chain[depth-1].key) <<
+ (inode->i_blkbits - 9),
+ 1 << inode->i_blkbits);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
@@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
+#ifdef CONFIG_FS_DAX
+ if (dax_mapping(mapping)) {
+ return dax_writeback_mapping_range(mapping,
+ mapping->host->i_sb->s_bdev,
+ wbc);
+ }
+#endif
+
return mpage_writepages(mapping, wbc, ext2_get_block);
}
@@ -1085,6 +1095,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
ext2_free_data(inode, p, q);
}
+/* dax_sem must be held when calling this function */
static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
@@ -1100,6 +1111,10 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
blocksize = inode->i_sb->s_blocksize;
iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
+#ifdef CONFIG_FS_DAX
+ WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+#endif
+
n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (n == 0)
return;
@@ -1185,7 +1200,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
return;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
+
+ dax_sem_down_write(EXT2_I(inode));
__ext2_truncate_blocks(inode, offset);
+ dax_sem_up_write(EXT2_I(inode));
}
static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1213,8 +1231,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
+ dax_sem_down_write(EXT2_I(inode));
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
+ dax_sem_up_write(EXT2_I(inode));
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
if (inode_needs_sync(inode)) {
@@ -1286,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
inode->i_flags |= S_DAX;
}
@@ -1410,6 +1430,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
sizeof(ei->i_data) - 1);
} else {
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 5d46c09863f0..b386af2e45f4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = ext2_mask_flags(inode->i_mode, flags);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = -EPERM;
goto setflags_out;
}
@@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
*/
if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = -EPERM;
goto setflags_out;
}
@@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mark_inode_dirty(inode);
setflags_out:
@@ -102,10 +102,10 @@ setflags_out:
goto setversion_out;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode->i_ctime = CURRENT_TIME_SEC;
inode->i_generation = generation;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mark_inode_dirty(inode);
setversion_out:
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index b4841e3066a5..d34843925b23 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -143,9 +143,6 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
struct inode * inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = dquot_initialize(dir);
if (err)
return err;
@@ -186,6 +183,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
if (l > sizeof (EXT2_I(inode)->i_data)) {
/* slow symlink */
inode->i_op = &ext2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
else
@@ -400,7 +398,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
else {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
inode_dec_link_count(old_dir);
}
@@ -410,11 +408,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 900e19cf9ef6..b78caf25f746 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb)
dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
- ext2_xattr_put_super(sb);
+ if (sbi->s_mb_cache) {
+ ext2_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
if (!(sb->s_flags & MS_RDONLY)) {
struct ext2_super_block *es = sbi->s_es;
@@ -192,6 +195,9 @@ static void init_once(void *foo)
init_rwsem(&ei->xattr_sem);
#endif
mutex_init(&ei->truncate_mutex);
+#ifdef CONFIG_FS_DAX
+ init_rwsem(&ei->dax_sem);
+#endif
inode_init_once(&ei->vfs_inode);
}
@@ -200,7 +206,7 @@ static int __init init_inodecache(void)
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext2_inode_cachep == NULL)
return -ENOMEM;
@@ -566,6 +572,8 @@ static int parse_options(char *options, struct super_block *sb)
/* Fall through */
case Opt_dax:
#ifdef CONFIG_FS_DAX
+ ext2_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
set_opt(sbi->s_mount_opt, DAX);
#else
ext2_msg(sb, KERN_INFO, "dax option not supported");
@@ -1099,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
ext2_msg(sb, KERN_ERR, "error: insufficient memory");
goto failed_mount3;
}
+
+#ifdef CONFIG_EXT2_FS_XATTR
+ sbi->s_mb_cache = ext2_xattr_create_cache();
+ if (!sbi->s_mb_cache) {
+ ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount3;
+ }
+#endif
/*
* set up enough so that it can read an inode
*/
@@ -1144,6 +1160,8 @@ cantfind_ext2:
sb->s_id);
goto failed_mount;
failed_mount3:
+ if (sbi->s_mb_cache)
+ ext2_xattr_destroy_cache(sbi->s_mb_cache);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1550,20 +1568,17 @@ MODULE_ALIAS_FS("ext2");
static int __init init_ext2_fs(void)
{
- int err = init_ext2_xattr();
- if (err)
- return err;
+ int err;
+
err = init_inodecache();
if (err)
- goto out1;
+ return err;
err = register_filesystem(&ext2_fs_type);
if (err)
goto out;
return 0;
out:
destroy_inodecache();
-out1:
- exit_ext2_xattr();
return err;
}
@@ -1571,7 +1586,6 @@ static void __exit exit_ext2_fs(void)
{
unregister_filesystem(&ext2_fs_type);
destroy_inodecache();
- exit_ext2_xattr();
}
MODULE_AUTHOR("Remy Card and others");
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index ae17179f3810..3495d8ae4b33 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -22,8 +22,7 @@
const struct inode_operations ext2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
@@ -35,7 +34,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
const struct inode_operations ext2_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext2_setattr,
#ifdef CONFIG_EXT2_FS_XATTR
.setxattr = generic_setxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0b6bfd3a398b..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -77,10 +77,8 @@
printk("\n"); \
} while (0)
# define ea_bdebug(bh, f...) do { \
- char b[BDEVNAME_SIZE]; \
- printk(KERN_DEBUG "block %s:%lu: ", \
- bdevname(bh->b_bdev, b), \
- (unsigned long) bh->b_blocknr); \
+ printk(KERN_DEBUG "block %pg:%lu: ", \
+ bh->b_bdev, (unsigned long) bh->b_blocknr); \
printk(f); \
printk("\n"); \
} while (0)
@@ -92,14 +90,12 @@
static int ext2_xattr_set2(struct inode *, struct buffer_head *,
struct ext2_xattr_header *);
-static int ext2_xattr_cache_insert(struct buffer_head *);
+static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
static struct buffer_head *ext2_xattr_cache_find(struct inode *,
struct ext2_xattr_header *);
static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *);
-static struct mb_cache *ext2_xattr_cache;
-
static const struct xattr_handler *ext2_xattr_handler_map[] = {
[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -154,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
size_t name_len, size;
char *end;
int error;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -198,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
goto found;
entry = next;
}
- if (ext2_xattr_cache_insert(bh))
+ if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
ea_idebug(inode, "cache insert failed");
error = -ENODATA;
goto cleanup;
@@ -211,7 +208,7 @@ found:
le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
goto bad_block;
- if (ext2_xattr_cache_insert(bh))
+ if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
ea_idebug(inode, "cache insert failed");
if (buffer) {
error = -ERANGE;
@@ -249,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
char *end;
size_t rest = buffer_size;
int error;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -283,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
goto bad_block;
entry = next;
}
- if (ext2_xattr_cache_insert(bh))
+ if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
ea_idebug(inode, "cache insert failed");
/* list the attribute names */
@@ -292,17 +290,21 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
const struct xattr_handler *handler =
ext2_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest) {
error = -ERANGE;
goto cleanup;
}
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
@@ -481,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
/* Here we know that we can set the new attribute. */
if (header) {
- struct mb_cache_entry *ce;
-
/* assert(header == HDR(bh)); */
- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
- bh->b_blocknr);
lock_buffer(bh);
if (header->h_refcount == cpu_to_le32(1)) {
+ __u32 hash = le32_to_cpu(header->h_hash);
+
ea_bdebug(bh, "modifying in-place");
- if (ce)
- mb_cache_entry_free(ce);
+ /*
+ * This must happen under buffer lock for
+ * ext2_xattr_set2() to reliably detect modified block
+ */
+ mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
+ hash, bh->b_blocknr);
+
/* keep the buffer locked while modifying it. */
} else {
int offset;
- if (ce)
- mb_cache_entry_release(ce);
unlock_buffer(bh);
ea_bdebug(bh, "cloning");
header = kmalloc(bh->b_size, GFP_KERNEL);
@@ -624,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
struct super_block *sb = inode->i_sb;
struct buffer_head *new_bh = NULL;
int error;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
if (header) {
new_bh = ext2_xattr_cache_find(inode, header);
@@ -651,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
don't need to change the reference count. */
new_bh = old_bh;
get_bh(new_bh);
- ext2_xattr_cache_insert(new_bh);
+ ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
} else {
/* We need to allocate a new block */
ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -672,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
memcpy(new_bh->b_data, header, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext2_xattr_cache_insert(new_bh);
+ ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
ext2_xattr_update_super_block(sb);
}
@@ -705,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
error = 0;
if (old_bh && old_bh != new_bh) {
- struct mb_cache_entry *ce;
-
/*
* If there was an old block and we are no longer using it,
* release the old block.
*/
- ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
- old_bh->b_blocknr);
lock_buffer(old_bh);
if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
+ __u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
+
+ /*
+ * This must happen under buffer lock for
+ * ext2_xattr_set2() to reliably detect freed block
+ */
+ mb_cache_entry_delete_block(ext2_mb_cache,
+ hash, old_bh->b_blocknr);
/* Free the old block. */
- if (ce)
- mb_cache_entry_free(ce);
ea_bdebug(old_bh, "freeing");
ext2_free_blocks(inode, old_bh->b_blocknr, 1);
mark_inode_dirty(inode);
@@ -728,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
} else {
/* Decrement the refcount only. */
le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
- if (ce)
- mb_cache_entry_release(ce);
dquot_free_block_nodirty(inode, 1);
mark_inode_dirty(inode);
mark_buffer_dirty(old_bh);
@@ -755,7 +759,6 @@ void
ext2_xattr_delete_inode(struct inode *inode)
{
struct buffer_head *bh = NULL;
- struct mb_cache_entry *ce;
down_write(&EXT2_I(inode)->xattr_sem);
if (!EXT2_I(inode)->i_file_acl)
@@ -775,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode)
EXT2_I(inode)->i_file_acl);
goto cleanup;
}
- ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
lock_buffer(bh);
if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
- if (ce)
- mb_cache_entry_free(ce);
+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+
+ /*
+ * This must happen under buffer lock for ext2_xattr_set2() to
+ * reliably detect freed block
+ */
+ mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
+ hash, bh->b_blocknr);
ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
get_bh(bh);
bforget(bh);
unlock_buffer(bh);
} else {
le32_add_cpu(&HDR(bh)->h_refcount, -1);
- if (ce)
- mb_cache_entry_release(ce);
ea_bdebug(bh, "refcount now=%d",
le32_to_cpu(HDR(bh)->h_refcount));
unlock_buffer(bh);
@@ -804,18 +810,6 @@ cleanup:
}
/*
- * ext2_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext2_xattr_put_super(struct super_block *sb)
-{
- mb_cache_shrink(sb->s_bdev);
-}
-
-
-/*
* ext2_xattr_cache_insert()
*
* Create a new entry in the extended attribute cache, and insert
@@ -824,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb)
* Returns 0, or a negative error number on failure.
*/
static int
-ext2_xattr_cache_insert(struct buffer_head *bh)
+ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
- struct mb_cache_entry *ce;
int error;
- ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
- if (!ce)
- return -ENOMEM;
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+ error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
if (error) {
- mb_cache_entry_free(ce);
if (error == -EBUSY) {
ea_bdebug(bh, "already in cache (%d cache entries)",
atomic_read(&ext2_xattr_cache->c_entry_count));
error = 0;
}
- } else {
- ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
- atomic_read(&ext2_xattr_cache->c_entry_count));
- mb_cache_entry_release(ce);
- }
+ } else
+ ea_bdebug(bh, "inserting [%x]", (int)hash);
return error;
}
@@ -902,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
+ struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
- hash);
+ ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
while (ce) {
struct buffer_head *bh;
- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
-
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -925,7 +905,21 @@ again:
inode->i_ino, (unsigned long) ce->e_block);
} else {
lock_buffer(bh);
- if (le32_to_cpu(HDR(bh)->h_refcount) >
+ /*
+ * We have to be careful about races with freeing or
+ * rehashing of xattr block. Once we hold buffer lock
+ * xattr block's state is stable so we can check
+ * whether the block got freed / rehashed or not.
+ * Since we unhash mbcache entry under buffer lock when
+ * freeing / rehashing xattr block, checking whether
+ * entry is still hashed is reliable.
+ */
+ if (hlist_bl_unhashed(&ce->e_hash_list)) {
+ mb_cache_entry_put(ext2_mb_cache, ce);
+ unlock_buffer(bh);
+ brelse(bh);
+ goto again;
+ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
EXT2_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %ld refcount %d>%d",
(unsigned long) ce->e_block,
@@ -934,13 +928,14 @@ again:
} else if (!ext2_xattr_cmp(header, HDR(bh))) {
ea_bdebug(bh, "b_count=%d",
atomic_read(&(bh->b_count)));
- mb_cache_entry_release(ce);
+ mb_cache_entry_touch(ext2_mb_cache, ce);
+ mb_cache_entry_put(ext2_mb_cache, ce);
return bh;
}
unlock_buffer(bh);
brelse(bh);
}
- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
}
return NULL;
}
@@ -1013,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
#undef BLOCK_HASH_SHIFT
-int __init
-init_ext2_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *ext2_xattr_create_cache(void)
{
- ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
- if (!ext2_xattr_cache)
- return -ENOMEM;
- return 0;
+ return mb_cache_create(HASH_BUCKET_BITS);
}
-void
-exit_ext2_xattr(void)
+void ext2_xattr_destroy_cache(struct mb_cache *cache)
{
- mb_cache_destroy(ext2_xattr_cache);
+ if (cache)
+ mb_cache_destroy(cache);
}
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 60edf298644e..6f82ab1b00ca 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -53,6 +53,8 @@ struct ext2_xattr_entry {
#define EXT2_XATTR_SIZE(size) \
(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
+struct mb_cache;
+
# ifdef CONFIG_EXT2_FS_XATTR
extern const struct xattr_handler ext2_xattr_user_handler;
@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
extern void ext2_xattr_delete_inode(struct inode *);
-extern void ext2_xattr_put_super(struct super_block *);
-extern int init_ext2_xattr(void);
-extern void exit_ext2_xattr(void);
+extern struct mb_cache *ext2_xattr_create_cache(void);
+extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
extern const struct xattr_handler *ext2_xattr_handlers[];
@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode)
{
}
-static inline void
-ext2_xattr_put_super(struct super_block *sb)
-{
-}
-
-static inline int
-init_ext2_xattr(void)
-{
- return 0;
-}
-
-static inline void
-exit_ext2_xattr(void)
+static inline void ext2_xattr_destroy_cache(struct mb_cache *cache)
{
}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 702fc6840246..ba97f243b050 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -7,37 +7,20 @@
#include <linux/security.h>
#include "xattr.h"
-static size_t
-ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
-ext2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
buffer, size);
}
static int
-ext2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
value, size, flags);
}
@@ -68,7 +51,6 @@ ext2_init_security(struct inode *inode, struct inode *dir,
const struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext2_xattr_security_list,
.get = ext2_xattr_security_get,
.set = ext2_xattr_security_set,
};
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 42b6e9874bcc..2c94d1930626 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -8,40 +8,26 @@
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext2_xattr_trusted_list(struct dentry *dentry)
{
- const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
-ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
buffer, size);
}
static int
-ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
value, size, flags);
}
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index ecdc4605192c..72a2a96d677f 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -10,30 +10,17 @@
#include "ext2.h"
#include "xattr.h"
-static size_t
-ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext2_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
-ext2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
@@ -41,11 +28,10 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index ec0668a60678..fe1f50fe764f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,7 +191,6 @@ static int ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -442,14 +441,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init block bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
goto verify;
}
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index af06830bfc00..6a6c27373b54 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -18,11 +18,9 @@
* Special Publication 800-38E and IEEE P1619/D16.
*/
-#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/skcipher.h>
#include <keys/user-type.h>
#include <keys/encrypted-type.h>
-#include <linux/crypto.h>
#include <linux/ecryptfs.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
@@ -34,6 +32,7 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <linux/spinlock_types.h>
+#include <linux/namei.h>
#include "ext4_extents.h"
#include "xattr.h"
@@ -93,7 +92,8 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
* Return: An allocated and initialized encryption context on success; error
* value or NULL otherwise.
*/
-struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode,
+ gfp_t gfp_flags)
{
struct ext4_crypto_ctx *ctx = NULL;
int res = 0;
@@ -120,7 +120,7 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
list_del(&ctx->free_list);
spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
if (!ctx) {
- ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS);
+ ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, gfp_flags);
if (!ctx) {
res = -ENOMEM;
goto out;
@@ -257,25 +257,26 @@ static int ext4_page_crypto(struct inode *inode,
ext4_direction_t rw,
pgoff_t index,
struct page *src_page,
- struct page *dest_page)
+ struct page *dest_page,
+ gfp_t gfp_flags)
{
u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct scatterlist dst, src;
struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, gfp_flags);
if (!req) {
printk_ratelimited(KERN_ERR
"%s: crypto_request_alloc() failed\n",
__func__);
return -ENOMEM;
}
- ablkcipher_request_set_callback(
+ skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
ext4_crypt_complete, &ecr);
@@ -285,33 +286,34 @@ static int ext4_page_crypto(struct inode *inode,
EXT4_XTS_TWEAK_SIZE - sizeof(index));
sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+ sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
- ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
- xts_tweak);
+ sg_set_page(&src, src_page, PAGE_SIZE, 0);
+ skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE,
+ xts_tweak);
if (rw == EXT4_DECRYPT)
- res = crypto_ablkcipher_decrypt(req);
+ res = crypto_skcipher_decrypt(req);
else
- res = crypto_ablkcipher_encrypt(req);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res) {
printk_ratelimited(
KERN_ERR
- "%s: crypto_ablkcipher_encrypt() returned %d\n",
+ "%s: crypto_skcipher_encrypt() returned %d\n",
__func__, res);
return res;
}
return 0;
}
-static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx)
+static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx,
+ gfp_t gfp_flags)
{
- ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT);
+ ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, gfp_flags);
if (ctx->w.bounce_page == NULL)
return ERR_PTR(-ENOMEM);
ctx->flags |= EXT4_WRITE_PATH_FL;
@@ -334,7 +336,8 @@ static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx)
* error value or NULL.
*/
struct page *ext4_encrypt(struct inode *inode,
- struct page *plaintext_page)
+ struct page *plaintext_page,
+ gfp_t gfp_flags)
{
struct ext4_crypto_ctx *ctx;
struct page *ciphertext_page = NULL;
@@ -342,17 +345,17 @@ struct page *ext4_encrypt(struct inode *inode,
BUG_ON(!PageLocked(plaintext_page));
- ctx = ext4_get_crypto_ctx(inode);
+ ctx = ext4_get_crypto_ctx(inode, gfp_flags);
if (IS_ERR(ctx))
return (struct page *) ctx;
/* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_bounce_page(ctx);
+ ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
if (IS_ERR(ciphertext_page))
goto errout;
ctx->w.control_page = plaintext_page;
err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page);
+ plaintext_page, ciphertext_page, gfp_flags);
if (err) {
ciphertext_page = ERR_PTR(err);
errout:
@@ -380,18 +383,16 @@ int ext4_decrypt(struct page *page)
{
BUG_ON(!PageLocked(page));
- return ext4_page_crypto(page->mapping->host,
- EXT4_DECRYPT, page->index, page, page);
+ return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT,
+ page->index, page, page, GFP_NOFS);
}
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+ ext4_fsblk_t pblk, ext4_lblk_t len)
{
struct ext4_crypto_ctx *ctx;
struct page *ciphertext_page = NULL;
struct bio *bio;
- ext4_lblk_t lblk = ex->ee_block;
- ext4_fsblk_t pblk = ext4_ext_pblock(ex);
- unsigned int len = ext4_ext_get_actual_len(ex);
int ret, err = 0;
#if 0
@@ -400,13 +401,13 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
(unsigned long) inode->i_ino, lblk, len);
#endif
- BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+ BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
- ctx = ext4_get_crypto_ctx(inode);
+ ctx = ext4_get_crypto_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ciphertext_page = alloc_bounce_page(ctx);
+ ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT);
if (IS_ERR(ciphertext_page)) {
err = PTR_ERR(ciphertext_page);
goto errout;
@@ -414,11 +415,12 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
while (len--) {
err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk,
- ZERO_PAGE(0), ciphertext_page);
+ ZERO_PAGE(0), ciphertext_page,
+ GFP_NOFS);
if (err)
goto errout;
- bio = bio_alloc(GFP_KERNEL, 1);
+ bio = bio_alloc(GFP_NOWAIT, 1);
if (!bio) {
err = -ENOMEM;
goto errout;
@@ -469,3 +471,66 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
return size;
return 0;
}
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct dentry *dir;
+ struct ext4_crypt_info *ci;
+ int dir_has_key, cached_with_key;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ dir = dget_parent(dentry);
+ if (!ext4_encrypted_inode(d_inode(dir))) {
+ dput(dir);
+ return 0;
+ }
+ ci = EXT4_I(d_inode(dir))->i_crypt_info;
+ if (ci && ci->ci_keyring_key &&
+ (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+ (1 << KEY_FLAG_REVOKED) |
+ (1 << KEY_FLAG_DEAD))))
+ ci = NULL;
+
+ /* this should eventually be a flag in d_flags */
+ cached_with_key = dentry->d_fsdata != NULL;
+ dir_has_key = (ci != NULL);
+ dput(dir);
+
+ /*
+ * If the dentry was cached without the key, and it is a
+ * negative dentry, it might be a valid name. We can't check
+ * if the key has since been made available due to locking
+ * reasons, so we fail the validation so ext4_lookup() can do
+ * this check.
+ *
+ * We also fail the validation if the dentry was created with
+ * the key present, but we no longer have the key, or vice versa.
+ */
+ if ((!cached_with_key && d_is_negative(dentry)) ||
+ (!cached_with_key && dir_has_key) ||
+ (cached_with_key && !dir_has_key)) {
+#if 0 /* Revalidation debug */
+ char buf[80];
+ char *cp = simple_dname(dentry, buf, sizeof(buf));
+
+ if (IS_ERR(cp))
+ cp = (char *) "???";
+ pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata,
+ cached_with_key, d_is_negative(dentry),
+ dir_has_key);
+#endif
+ return 0;
+ }
+ return 1;
+}
+
+const struct dentry_operations ext4_encrypted_d_ops = {
+ .d_revalidate = ext4_d_revalidate,
+};
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 2fbef8a14760..1a2f360405db 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -11,11 +11,9 @@
*
*/
-#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/skcipher.h>
#include <keys/encrypted-type.h>
#include <keys/user-type.h>
-#include <linux/crypto.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/key.h>
@@ -65,10 +63,10 @@ static int ext4_fname_encrypt(struct inode *inode,
struct ext4_str *oname)
{
u32 ciphertext_len;
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
char iv[EXT4_CRYPTO_BLOCK_SIZE];
struct scatterlist src_sg, dst_sg;
@@ -95,14 +93,14 @@ static int ext4_fname_encrypt(struct inode *inode,
}
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(
KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
kfree(alloc_buf);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
ext4_dir_crypt_complete, &ecr);
@@ -117,14 +115,14 @@ static int ext4_fname_encrypt(struct inode *inode,
/* Create encryption request */
sg_init_one(&src_sg, workbuf, ciphertext_len);
sg_init_one(&dst_sg, oname->name, ciphertext_len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
- res = crypto_ablkcipher_encrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
kfree(alloc_buf);
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(
KERN_ERR "%s: Error (error code %d)\n", __func__, res);
@@ -145,11 +143,11 @@ static int ext4_fname_decrypt(struct inode *inode,
struct ext4_str *oname)
{
struct ext4_str tmp_in[2], tmp_out[1];
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct scatterlist src_sg, dst_sg;
struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = ci->ci_ctfm;
int res = 0;
char iv[EXT4_CRYPTO_BLOCK_SIZE];
unsigned lim = max_name_len(inode);
@@ -162,13 +160,13 @@ static int ext4_fname_decrypt(struct inode *inode,
tmp_out[0].name = oname->name;
/* Allocate request */
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
printk_ratelimited(
KERN_ERR "%s: crypto_request_alloc() failed\n", __func__);
return -ENOMEM;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
ext4_dir_crypt_complete, &ecr);
@@ -178,13 +176,13 @@ static int ext4_fname_decrypt(struct inode *inode,
/* Create encryption request */
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
- res = crypto_ablkcipher_decrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+ res = crypto_skcipher_decrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
- ablkcipher_request_free(req);
+ skcipher_request_free(req);
if (res < 0) {
printk_ratelimited(
KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n",
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index c5882b36e558..0129d688d1f7 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -8,6 +8,7 @@
* Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
*/
+#include <crypto/skcipher.h>
#include <keys/encrypted-type.h>
#include <keys/user-type.h>
#include <linux/random.h>
@@ -41,45 +42,42 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
{
int res = 0;
- struct ablkcipher_request *req = NULL;
+ struct skcipher_request *req = NULL;
DECLARE_EXT4_COMPLETION_RESULT(ecr);
struct scatterlist src_sg, dst_sg;
- struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
- 0);
+ struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
if (IS_ERR(tfm)) {
res = PTR_ERR(tfm);
tfm = NULL;
goto out;
}
- crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+ crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+ req = skcipher_request_alloc(tfm, GFP_NOFS);
if (!req) {
res = -ENOMEM;
goto out;
}
- ablkcipher_request_set_callback(req,
+ skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
derive_crypt_complete, &ecr);
- res = crypto_ablkcipher_setkey(tfm, deriving_key,
- EXT4_AES_128_ECB_KEY_SIZE);
+ res = crypto_skcipher_setkey(tfm, deriving_key,
+ EXT4_AES_128_ECB_KEY_SIZE);
if (res < 0)
goto out;
sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
- EXT4_AES_256_XTS_KEY_SIZE, NULL);
- res = crypto_ablkcipher_encrypt(req);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ EXT4_AES_256_XTS_KEY_SIZE, NULL);
+ res = crypto_skcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
wait_for_completion(&ecr.completion);
res = ecr.res;
}
out:
- if (req)
- ablkcipher_request_free(req);
- if (tfm)
- crypto_free_ablkcipher(tfm);
+ skcipher_request_free(req);
+ crypto_free_skcipher(tfm);
return res;
}
@@ -90,7 +88,7 @@ void ext4_free_crypt_info(struct ext4_crypt_info *ci)
if (ci->ci_keyring_key)
key_put(ci->ci_keyring_key);
- crypto_free_ablkcipher(ci->ci_ctfm);
+ crypto_free_skcipher(ci->ci_ctfm);
kmem_cache_free(ext4_crypt_info_cachep, ci);
}
@@ -122,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode)
struct ext4_encryption_context ctx;
const struct user_key_payload *ukp;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct crypto_ablkcipher *ctfm;
+ struct crypto_skcipher *ctfm;
const char *cipher_str;
char raw_key[EXT4_MAX_KEY_SIZE];
char mode;
@@ -213,9 +211,11 @@ retry:
res = -ENOKEY;
goto out;
}
+ down_read(&keyring_key->sem);
ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
res = -EINVAL;
+ up_read(&keyring_key->sem);
goto out;
}
master_key = (struct ext4_encryption_key *)ukp->data;
@@ -226,14 +226,16 @@ retry:
"ext4: key size incorrect: %d\n",
master_key->size);
res = -ENOKEY;
+ up_read(&keyring_key->sem);
goto out;
}
res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
raw_key);
+ up_read(&keyring_key->sem);
if (res)
goto out;
got_key:
- ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+ ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
if (!ctfm || IS_ERR(ctfm)) {
res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
printk(KERN_DEBUG
@@ -242,11 +244,11 @@ got_key:
goto out;
}
crypt_info->ci_ctfm = ctfm;
- crypto_ablkcipher_clear_flags(ctfm, ~0);
- crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+ crypto_skcipher_clear_flags(ctfm, ~0);
+ crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm),
CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_ablkcipher_setkey(ctfm, raw_key,
- ext4_encryption_key_size(mode));
+ res = crypto_skcipher_setkey(ctfm, raw_key,
+ ext4_encryption_key_size(mode));
if (res)
goto out;
memzero_explicit(raw_key, sizeof(raw_key));
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1d1bca74f844..561d7308b393 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
int dir_has_error = 0;
struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
+ if (ext4_encrypted_inode(inode)) {
+ err = ext4_get_encryption_info(inode);
+ if (err && err != -ENOKEY)
+ return err;
+ }
+
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(file, ctx);
if (err != ERR_BAD_DX_DIR) {
@@ -149,16 +155,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
pgoff_t index = map.m_pblk >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&file->f_ra, index))
page_cache_sync_readahead(
sb->s_bdev->bd_inode->i_mapping,
&file->f_ra, file,
index, 1);
- file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
bh = ext4_bread(NULL, inode, map.m_lblk, 0);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ bh = NULL;
+ goto errout;
+ }
}
if (!bh) {
@@ -276,7 +285,7 @@ errout:
static inline int is_32bit_api(void)
{
#ifdef CONFIG_COMPAT
- return is_compat_task();
+ return in_compat_syscall();
#else
return (BITS_PER_LONG == 32);
#endif
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 750063f7a50c..349afebe21ee 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -26,6 +26,7 @@
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
+#include <linux/version.h>
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
@@ -41,6 +42,18 @@
*/
/*
+ * with AGGRESSIVE_CHECK the allocator runs consistency checks over
+ * structures. These checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+
+/*
* Define EXT4FS_DEBUG to produce debug messages
*/
#undef EXT4FS_DEBUG
@@ -181,9 +194,9 @@ typedef struct ext4_io_end {
struct bio *bio; /* Linked list of completed
* bios covering the extent */
unsigned int flag; /* unwritten or not */
+ atomic_t count; /* reference counter */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- atomic_t count; /* reference counter */
} ext4_io_end_t;
struct ext4_io_submit {
@@ -377,14 +390,22 @@ struct flex_groups {
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
-#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */
+
+#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
+ EXT4_IMMUTABLE_FL | \
+ EXT4_APPEND_FL | \
+ EXT4_NODUMP_FL | \
+ EXT4_NOATIME_FL | \
+ EXT4_PROJINHERIT_FL)
/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
- EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
+ EXT4_PROJINHERIT_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
@@ -554,10 +575,12 @@ enum {
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
- /* Do not take i_data_sem locking in ext4_map_blocks */
-#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100
+ /* Write zeros to newly created written extents */
+#define EXT4_GET_BLOCKS_ZERO 0x0200
+#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
+ EXT4_GET_BLOCKS_ZERO)
/*
* The bit position of these flags must not overlap with any of the
@@ -615,6 +638,46 @@ enum {
#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
+#ifndef FS_IOC_FSGETXATTR
+/* Until the uapi changes get merged for project quota... */
+
+#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr)
+#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr)
+
+/*
+ * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+ __u32 fsx_xflags; /* xflags field value (get/set) */
+ __u32 fsx_extsize; /* extsize field value (get/set)*/
+ __u32 fsx_nextents; /* nextents field value (get) */
+ __u32 fsx_projid; /* project identifier (get/set) */
+ unsigned char fsx_pad[12];
+};
+
+/*
+ * Flags for the fsx_xflags field
+ */
+#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
+#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
+#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
+#define FS_XFLAG_APPEND 0x00000010 /* all writes append */
+#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
+#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */
+#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
+#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
+#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
+#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
+#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
+#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
+#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
+#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
+#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
+#endif /* !defined(FS_IOC_FSGETXATTR) */
+
+#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
@@ -727,19 +790,55 @@ struct move_extent {
<= (EXT4_GOOD_OLD_INODE_SIZE + \
(einode)->i_extra_isize)) \
+/*
+ * We use an encoding that preserves the times for extra epoch "00":
+ *
+ *   extra  msb of                         adjust for signed
+ *   epoch  32-bit                         32-bit tv_sec to
+ *   bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
+ *   0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
+ *   0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
+ *   0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
+ *   0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
+ *   1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
+ *   1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
+ *   1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
+ *   1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
+ *
+ * Note that previous versions of the kernel on 64-bit systems would
+ * incorrectly use extra epoch bits 1,1 for dates between 1901 and
+ * 1970. e2fsck will correct this, assuming that it is run on the
+ * affected filesystem before 2242.
+ */
+
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
- return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
- (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
- ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
+ u32 extra = sizeof(time->tv_sec) > 4 ?
+ ((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0;
+ return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
{
- if (sizeof(time->tv_sec) > 4)
- time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
- << 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ if (unlikely(sizeof(time->tv_sec) > 4 &&
+ (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0)
+ /* Handle legacy encoding of pre-1970 dates with epoch
+ * bits 1,1. We assume that by kernel version 4.20,
+ * everyone will have run fsck over the affected
+ * filesystems to correct the problem. (This
+ * backwards compatibility may be removed before this
+ * time, at the discretion of the ext4 developers.)
+ */
+ u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK;
+ if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0)
+ extra_bits = 0;
+ time->tv_sec += extra_bits << 32;
+#else
+ time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
+#endif
+ }
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -813,6 +912,29 @@ do { \
#include "extents_status.h"
/*
+ * Lock subclasses for i_data_sem in the ext4_inode_info structure.
+ *
+ * These are needed to avoid lockdep false positives when we need to
+ * allocate blocks to the quota inode during ext4_map_blocks(), while
+ * holding i_data_sem for a normal (non-quota) inode. Since we don't
+ * do quota tracking for the quota inode, this avoids deadlock (as
+ * well as infinite recursion, since it isn't turtles all the way
+ * down...)
+ *
+ * I_DATA_SEM_NORMAL - Used for most inodes
+ * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode
+ * where the second inode has larger inode number
+ * than the first
+ * I_DATA_SEM_QUOTA - Used for quota inodes only
+ */
+enum {
+ I_DATA_SEM_NORMAL = 0,
+ I_DATA_SEM_OTHER,
+ I_DATA_SEM_QUOTA,
+};
+
+
+/*
* fourth extended file system inode data in memory
*/
struct ext4_inode_info {
@@ -873,6 +995,15 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
+ /*
+ * i_mmap_sem is for serializing page faults with truncate / punch hole
+ * operations. We have to make sure that new page cannot be faulted in
+ * a section of the inode that is being punched. We cannot easily use
+ * i_data_sem for this since we need protection for the whole punch
+ * operation and i_data_sem ranks below transaction start so we have
+ * to occasionally drop it.
+ */
+ struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
@@ -928,13 +1059,8 @@ struct ext4_inode_info {
* transaction reserved
*/
struct list_head i_rsv_conversion_list;
- /*
- * Completed IOs that need unwritten extents handling and don't have
- * transaction reserved
- */
- atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
struct work_struct i_rsv_conversion_work;
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -956,6 +1082,7 @@ struct ext4_inode_info {
/* Encryption params */
struct ext4_crypt_info *i_crypt_info;
#endif
+ kprojid_t i_projid;
};
/*
@@ -1211,7 +1338,7 @@ struct ext4_super_block {
#endif
/* Number of quota types we support */
-#define EXT4_MAXQUOTAS 2
+#define EXT4_MAXQUOTAS 3
/*
* fourth extended-fs super-block data in memory
@@ -1407,25 +1534,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
-static inline void ext4_set_io_unwritten_flag(struct inode *inode,
- struct ext4_io_end *io_end)
-{
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
- io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_unwritten);
- }
-}
-
-static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
-{
- return inode->i_private;
-}
-
-static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
-{
- inode->i_private = io;
-}
-
/*
* Inode dynamic state flags
*/
@@ -1717,7 +1825,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
- EXT4_FEATURE_RO_COMPAT_QUOTA)
+ EXT4_FEATURE_RO_COMPAT_QUOTA |\
+ EXT4_FEATURE_RO_COMPAT_PROJECT)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -1759,6 +1868,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
#define EXT4_DEF_RESUID 0
#define EXT4_DEF_RESGID 0
+/*
+ * Default project ID
+ */
+#define EXT4_DEF_PROJID 0
+
#define EXT4_DEF_INODE_READAHEAD_BLKS 32
/*
@@ -1870,7 +1984,7 @@ ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
unsigned len = le16_to_cpu(dlen);
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len == EXT4_MAX_REC_LEN || len == 0)
return blocksize;
return (len & 65532) | ((len & 3) << 16);
@@ -1883,7 +1997,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
BUG();
-#if (PAGE_CACHE_SIZE >= 65536)
+#if (PAGE_SIZE >= 65536)
if (len < 65536)
return cpu_to_le16(len);
if (len == blocksize) {
@@ -2191,13 +2305,17 @@ extern struct kmem_cache *ext4_crypt_info_cachep;
bool ext4_valid_contents_enc_mode(uint32_t mode);
uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
extern struct workqueue_struct *ext4_read_workqueue;
-struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode);
+struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode,
+ gfp_t gfp_flags);
void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
void ext4_restore_control_page(struct page *data_page);
struct page *ext4_encrypt(struct inode *inode,
- struct page *plaintext_page);
+ struct page *plaintext_page,
+ gfp_t gfp_flags);
int ext4_decrypt(struct page *page);
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+ ext4_fsblk_t pblk, ext4_lblk_t len);
+extern const struct dentry_operations ext4_encrypted_d_ops;
#ifdef CONFIG_EXT4_FS_ENCRYPTION
int ext4_init_crypto(void);
@@ -2401,12 +2519,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
-int ext4_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
+ struct buffer_head *bh_result, int create);
+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
@@ -2447,9 +2567,16 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
+extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
+ ext4_fsblk_t pblk, ext4_lblk_t len);
+extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+ unsigned int map_len,
+ struct extent_status *result);
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2788,7 +2915,7 @@ do { \
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
- !mutex_is_locked(&inode->i_mutex));
+ !inode_is_locked(inode));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = newsize;
@@ -2811,6 +2938,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
return changed;
}
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len);
+
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
@@ -2949,8 +3079,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
struct page *page);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
- struct dentry *dentry,
- struct inode *inode);
+ struct inode *dir, struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
struct inode *parent,
struct inode *inode);
@@ -3174,15 +3303,33 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
#define EXT4_WQ_HASH_SZ 37
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
EXT4_WQ_HASH_SZ])
-#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
- EXT4_WQ_HASH_SZ])
extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
#define EXT4_RESIZING 0
extern int ext4_resize_begin(struct super_block *sb);
extern void ext4_resize_end(struct super_block *sb);
+static inline void ext4_set_io_unwritten_flag(struct inode *inode,
+ struct ext4_io_end *io_end)
+{
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ io_end->flag |= EXT4_IO_END_UNWRITTEN;
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
+ }
+}
+
+static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
+{
+ struct inode *inode = io_end->inode;
+
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
+ /* Wake up anyone waiting on unwritten extent conversion */
+ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
+ wake_up_all(ext4_ioend_wq(inode));
+ }
+}
+
#endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
index ac7d4e813796..1f73c29717e1 100644
--- a/fs/ext4/ext4_crypto.h
+++ b/fs/ext4/ext4_crypto.h
@@ -77,7 +77,7 @@ struct ext4_crypt_info {
char ci_data_mode;
char ci_filename_mode;
char ci_flags;
- struct crypto_ablkcipher *ci_ctfm;
+ struct crypto_skcipher *ci_ctfm;
struct key *ci_keyring_key;
char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE];
};
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 3c9381547094..8ecf84b8f5a1 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 551353b1b17a..95bf4679ac54 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
*/
if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
+ /*
+ * The check for IO to unwritten extent is somewhat racy as we
+ * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
+ * dropping i_data_sem. But reserved blocks should save us in that
+ * case.
+ */
if (ext4_ext_is_unwritten(ex1) &&
(ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
}
/*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
+ * ext4_ext_determine_hole - determine hole around given block
+ * @inode: inode we lookup in
+ * @path: path in extent tree to @lblk
+ * @lblk: pointer to logical block around which we want to determine hole
+ *
+ * Determine hole length (and start if easily possible) around given logical
+ * block. We don't try too hard to find the beginning of the hole, but if @path
+ * actually points to the extent before @lblk, we provide it.
+ *
+ * The function returns the length of a hole starting at @lblk. We update @lblk
+ * to the beginning of the hole if we managed to find it.
*/
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
- ext4_lblk_t block)
+static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *lblk)
{
int depth = ext_depth(inode);
- ext4_lblk_t len;
- ext4_lblk_t lblock;
struct ext4_extent *ex;
- struct extent_status es;
+ ext4_lblk_t len;
ex = path[depth].p_ext;
if (ex == NULL) {
/* there is no extent yet, so gap is [0;-] */
- lblock = 0;
+ *lblk = 0;
len = EXT_MAX_BLOCKS;
- ext_debug("cache gap(whole file):");
- } else if (block < le32_to_cpu(ex->ee_block)) {
- lblock = block;
- len = le32_to_cpu(ex->ee_block) - block;
- ext_debug("cache gap(before): %u [%u:%u]",
- block,
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex));
- } else if (block >= le32_to_cpu(ex->ee_block)
+ } else if (*lblk < le32_to_cpu(ex->ee_block)) {
+ len = le32_to_cpu(ex->ee_block) - *lblk;
+ } else if (*lblk >= le32_to_cpu(ex->ee_block)
+ ext4_ext_get_actual_len(ex)) {
ext4_lblk_t next;
- lblock = le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex);
+ *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
next = ext4_ext_next_allocated_block(path);
- ext_debug("cache gap(after): [%u:%u] %u",
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex),
- block);
- BUG_ON(next == lblock);
- len = next - lblock;
+ BUG_ON(next == *lblk);
+ len = next - *lblk;
} else {
BUG();
}
+ return len;
+}
+
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static void
+ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
+ ext4_lblk_t hole_len)
+{
+ struct extent_status es;
- ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+ ext4_es_find_delayed_extent_range(inode, hole_start,
+ hole_start + hole_len - 1, &es);
if (es.es_len) {
/* There's delayed extent containing lblock? */
- if (es.es_lblk <= lblock)
+ if (es.es_lblk <= hole_start)
return;
- len = min(es.es_lblk - lblock, len);
+ hole_len = min(es.es_lblk - hole_start, hole_len);
}
- ext_debug(" -> %u:%u\n", lblock, len);
- ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
+ ext_debug(" -> %u:%u\n", hole_start, hole_len);
+ ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
+ EXTENT_STATUS_HOLE);
}
/*
@@ -3119,19 +3135,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
ext4_fsblk_t ee_pblock;
unsigned int ee_len;
- int ret;
ee_len = ext4_ext_get_actual_len(ex);
ee_pblock = ext4_ext_pblock(ex);
-
- if (ext4_encrypted_inode(inode))
- return ext4_encrypted_zeroout(inode, ex);
-
- ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
- if (ret > 0)
- ret = 0;
-
- return ret;
+ return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
+ ee_len);
}
/*
@@ -3935,8 +3943,8 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
static int
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath, int flags,
- unsigned int allocated, ext4_fsblk_t newblock)
+ struct ext4_ext_path **ppath,
+ unsigned int allocated)
{
struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
@@ -4015,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path = *ppath;
int ret = 0;
int err = 0;
- ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4038,20 +4045,19 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
flags | EXT4_GET_BLOCKS_CONVERT);
if (ret <= 0)
goto out;
- /*
- * Flag the inode(non aio case) or end_io struct (aio case)
- * that this IO needs to conversion to written when IO is
- * completed
- */
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
+ allocated);
+ if (err < 0)
+ goto out2;
+ }
ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
ppath);
if (ret >= 0) {
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
- int set_unwritten = 0;
bool map_from_cluster = false;
ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
allocated = convert_initialized_extent(
handle, inode, map, &path,
- flags, allocated, newblock);
+ allocated);
goto out2;
} else if (!ext4_ext_is_unwritten(ex))
goto out;
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* we couldn't try to create block if create flag is zero
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ ext4_lblk_t hole_start, hole_len;
+
+ hole_start = map->m_lblk;
+ hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
/*
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+
+ /* Update hole_len to reflect hole size after map->m_lblk */
+ if (hole_start != map->m_lblk)
+ hole_len -= map->m_lblk - hole_start;
+ map->m_pblk = 0;
+ map->m_len = min_t(unsigned int, map->m_len, hole_len);
+
goto out2;
}
@@ -4482,15 +4497,6 @@ got_allocated_blocks:
if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
- /*
- * io_end structure was created for every IO write to an
- * unwritten extent. To avoid unnecessary conversion,
- * here we flag the IO that really needs the conversion.
- * For non asycn direct IO case, flag the inode state
- * that we need to perform conversion when IO is done.
- */
- if (flags & EXT4_GET_BLOCKS_PRE_IO)
- set_unwritten = 1;
}
err = 0;
@@ -4501,14 +4507,6 @@ got_allocated_blocks:
err = ext4_ext_insert_extent(handle, inode, &path,
&newex, flags);
- if (!err && set_unwritten) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
-
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4685,10 +4683,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
if (len <= EXT_UNWRITTEN_MAX_LEN)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
/*
* credits to insert 1 extent into extent tree
*/
@@ -4752,8 +4746,6 @@ retry:
goto retry;
}
- ext4_inode_resume_unlocked_dio(inode);
-
return ret > 0 ? ret2 : ret;
}
@@ -4770,7 +4762,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
- struct address_space *mapping = inode->i_mapping;
unsigned int blkbits = inode->i_blkbits;
trace_ext4_zero_range(inode, offset, len, mode);
@@ -4786,17 +4777,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + len - 1);
- if (ret)
- return ret;
- }
-
- /*
* Round up offset. This is not fallocate, we neet to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
@@ -4817,7 +4797,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
else
max_blocks -= lblk;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Indirect files do not support unwritten extnets
@@ -4839,6 +4819,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4847,7 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
round_down(offset, 1 << blkbits)) >> blkbits,
new_size, flags, mode);
if (ret)
- goto out_mutex;
+ goto out_dio;
}
@@ -4856,16 +4840,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
- /* Now release the pages and zero block aligned part of pages*/
+ /*
+ * Prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_mmap_sem);
+ goto out_dio;
+ }
+ /* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
-
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
goto out_dio;
}
@@ -4909,7 +4900,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
out_dio:
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -4980,7 +4971,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* We only support preallocation for extent-based files only
@@ -4998,8 +4989,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
+ ext4_inode_resume_unlocked_dio(inode);
if (ret)
goto out;
@@ -5008,7 +5004,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
EXT4_I(inode)->i_sync_tid);
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
return ret;
}
@@ -5494,21 +5490,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down offset to be aligned with page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
- mutex_lock(&inode->i_mutex);
-
+ inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
@@ -5524,17 +5506,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /*
+ * Write tail of the last page before removed range since it will get
+ * removed from the page cache below.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+ if (ret)
+ goto out_mmap;
+ /*
+ * Write data that will be shifted to preserve them when discarding
+ * page cache below. We are also protected from pages becoming dirty
+ * by i_mmap_sem.
+ */
+ ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,10 +5581,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -5627,21 +5636,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
-
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- return ret;
-
- /* Take mutex lock */
- mutex_lock(&inode->i_mutex);
-
+ inode_lock(inode);
/* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ret = -EOPNOTSUPP;
@@ -5660,17 +5655,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
- truncate_pagecache(inode, ioffset);
-
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
+ /*
+ * Need to round down to align start offset to page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ goto out_mmap;
+ truncate_pagecache(inode, ioffset);
+
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_dio;
+ goto out_mmap;
}
/* Expand file to avoid data loss if there is error while shifting */
@@ -5741,10 +5751,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -5779,8 +5790,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
- BUG_ON(!mutex_is_locked(&inode1->i_mutex));
- BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+ BUG_ON(!inode_is_locked(inode1));
+ BUG_ON(!inode_is_locked(inode2));
*erp = ext4_es_remove_extent(inode1, lblk1, count);
if (unlikely(*erp))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac748b3af1c1..e38b987ac7f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ out:
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
- if (!ext4_es_is_referenced(es))
- ext4_es_set_referenced(es);
+ if (!ext4_es_is_referenced(es1))
+ ext4_es_set_referenced(es1);
stats->es_stats_cache_hits++;
} else {
stats->es_stats_cache_misses++;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 113837e7ba98..fa2208bae2e1 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(iocb->ki_filp);
- struct mutex *aio_mutex = NULL;
struct blk_plug plug;
int o_direct = iocb->ki_flags & IOCB_DIRECT;
+ int unaligned_aio = 0;
int overwrite = 0;
ssize_t ret;
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
/*
- * Unaligned direct AIO must be serialized; see comment above
- * In the case of O_APPEND, assume that we must always serialize
+ * Unaligned direct AIO must be serialized among each other as zeroing
+ * of partial blocks of two competing unaligned AIOs can result in data
+ * corruption.
*/
- if (o_direct &&
- ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) &&
- (iocb->ki_flags & IOCB_APPEND ||
- ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
- aio_mutex = ext4_aio_mutex(inode);
- mutex_lock(aio_mutex);
+ ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
+ unaligned_aio = 1;
ext4_unwritten_wait(inode);
}
- mutex_lock(&inode->i_mutex);
- ret = generic_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
-
/*
* If we have encountered a bitmap-format file, the size limit
* is smaller than s_maxbytes, which is for extent-mapped files.
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
blk_start_plug(&plug);
/* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
!file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
struct ext4_map_blocks map;
unsigned int blkbits = inode->i_blkbits;
@@ -169,7 +167,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
ret = __generic_file_write_iter(iocb, from);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret > 0) {
ssize_t err;
@@ -181,55 +179,43 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (o_direct)
blk_finish_plug(&plug);
- if (aio_mutex)
- mutex_unlock(aio_mutex);
return ret;
out:
- mutex_unlock(&inode->i_mutex);
- if (aio_mutex)
- mutex_unlock(aio_mutex);
+ inode_unlock(inode);
return ret;
}
#ifdef CONFIG_FS_DAX
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
- struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
- loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
- int err;
- if (!uptodate)
- return;
- WARN_ON(!buffer_unwritten(bh));
- err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int result;
handle_t *handle = NULL;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
- }
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
- result = __dax_fault(vma, vmf, ext4_get_block_dax,
- ext4_end_io_unwritten);
+ result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
- }
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
@@ -246,44 +232,73 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) {
sb_start_pagefault(sb);
file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
ext4_chunk_trans_blocks(inode,
PMD_SIZE / PAGE_SIZE));
- }
+ } else
+ down_read(&EXT4_I(inode)->i_mmap_sem);
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
result = __dax_pmd_fault(vma, addr, pmd, flags,
- ext4_get_block_dax, ext4_end_io_unwritten);
+ ext4_dax_mmap_get_block, NULL);
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
- }
+ } else
+ up_read(&EXT4_I(inode)->i_mmap_sem);
return result;
}
-static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
+ * handler we check for races agaist truncate. Note that since we cycle through
+ * i_mmap_sem, we are sure that also any hole punching that began before we
+ * were called is finished by now and so if it included part of the file we
+ * are working on, our pte will get unmapped and the check for pte_same() in
+ * wp_pfn_shared() fails. Thus fault gets retried and things work out as
+ * desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block_dax,
- ext4_end_io_unwritten);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ loff_t size;
+ int ret;
+
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = dax_pfn_mkwrite(vma, vmf);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+ sb_end_pagefault(sb);
+
+ return ret;
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.pmd_fault = ext4_dax_pmd_fault,
- .page_mkwrite = ext4_dax_mkwrite,
- .pfn_mkwrite = dax_pfn_mkwrite,
+ .page_mkwrite = ext4_dax_fault,
+ .pfn_mkwrite = ext4_dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
- .fault = filemap_fault,
+ .fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
@@ -314,6 +329,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct vfsmount *mnt = filp->f_path.mnt;
+ struct dentry *dir;
struct path path;
char buf[64], *cp;
int ret;
@@ -357,6 +373,18 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
if (ext4_encryption_info(inode) == NULL)
return -ENOKEY;
}
+
+ dir = dget_parent(file_dentry(filp));
+ if (ext4_encrypted_inode(d_inode(dir)) &&
+ !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) {
+ ext4_warning(inode->i_sb,
+ "Inconsistent encryption contexts: %lu/%lu\n",
+ (unsigned long) d_inode(dir)->i_ino,
+ (unsigned long) inode->i_ino);
+ dput(dir);
+ return -EPERM;
+ }
+ dput(dir);
/*
* Set up the jbd2_inode if we are opening the inode for
* writing and the journal is present
@@ -387,7 +415,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
*/
static int ext4_find_unwritten_pgoff(struct inode *inode,
int whence,
- struct ext4_map_blocks *map,
+ ext4_lblk_t end_blk,
loff_t *offset)
{
struct pagevec pvec;
@@ -402,10 +430,10 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
- endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+ endoff = (loff_t)end_blk << blkbits;
- index = startoff >> PAGE_CACHE_SHIFT;
- end = endoff >> PAGE_CACHE_SHIFT;
+ index = startoff >> PAGE_SHIFT;
+ end = endoff >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
do {
@@ -520,18 +548,17 @@ out:
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t dataoff, isize;
int blkbits;
- int ret = 0;
+ int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
@@ -542,44 +569,35 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
dataoff = offset;
do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
- break;
+ ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+ if (ret <= 0) {
+ /* No extent found -> no data */
+ if (ret == 0)
+ ret = -ENXIO;
+ inode_unlock(inode);
+ return ret;
}
- /*
- * If there is a delay extent at this offset,
- * it will be as a data.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- if (last != start)
- dataoff = (loff_t)last << blkbits;
+ last = es.es_lblk;
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
+ if (!ext4_es_is_unwritten(&es))
break;
- }
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- &map, &dataoff);
- if (unwritten)
- break;
- }
-
- last++;
+ if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ es.es_lblk + es.es_len, &dataoff))
+ break;
+ last += es.es_len;
dataoff = (loff_t)last << blkbits;
+ cond_resched();
} while (last <= end);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (dataoff > isize)
return -ENXIO;
@@ -593,18 +611,17 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_map_blocks map;
struct extent_status es;
ext4_lblk_t start, last, end;
loff_t holeoff, isize;
int blkbits;
- int ret = 0;
+ int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
@@ -615,47 +632,33 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
holeoff = offset;
do {
- map.m_lblk = last;
- map.m_len = end - last + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
+ ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+ if (ret < 0) {
+ inode_unlock(inode);
+ return ret;
}
-
- /*
- * If there is a delay extent at this offset,
- * we will skip this extent.
- */
- ext4_es_find_delayed_extent_range(inode, last, last, &es);
- if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
- last = es.es_lblk + es.es_len;
- holeoff = (loff_t)last << blkbits;
- continue;
+ /* Found a hole? */
+ if (ret == 0 || es.es_lblk > last) {
+ if (last != start)
+ holeoff = (loff_t)last << blkbits;
+ break;
}
-
/*
* If there is a unwritten extent at this offset,
* it will be as a data or a hole according to page
* cache that has data or not.
*/
- if (map.m_flags & EXT4_MAP_UNWRITTEN) {
- int unwritten;
- unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- &map, &holeoff);
- if (!unwritten) {
- last += ret;
- holeoff = (loff_t)last << blkbits;
- continue;
- }
- }
+ if (ext4_es_is_unwritten(&es) &&
+ ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ last + es.es_len, &holeoff))
+ break;
- /* find a hole */
- break;
+ last += es.es_len;
+ holeoff = (loff_t)last << blkbits;
+ cond_resched();
} while (last <= end);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (holeoff > isize)
holeoff = isize;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1b8024d26f65..237b877d316d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,7 +76,6 @@ static int ext4_init_inode_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks and inodes use to prevent
* allocation, essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -191,8 +190,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
- if (err)
+ if (err) {
+ ext4_error(sb, "Failed to init inode bitmap for group "
+ "%u: %d", block_group, err);
goto out;
+ }
return bh;
}
ext4_unlock_group(sb, block_group);
@@ -785,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
sbi = EXT4_SB(sb);
/*
- * Initalize owners and quota early so that we don't have to account
+ * Initialize owners and quota early so that we don't have to account
* for quota initialization worst case in standard inode creating
* transaction
*/
@@ -799,6 +801,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
+ ei->i_projid = EXT4_I(dir)->i_projid;
+ else
+ ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+
err = dquot_initialize(inode);
if (err)
goto out;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 355ef9c36c87..3027fa681de5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
goto got_it;
}
- /* Next simple case - plain lookup or failed read of indirect block */
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+ /* Next simple case - plain lookup failed */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
+ int i;
+
+ /* Count number blocks in a subtree under 'partial' */
+ count = 1;
+ for (i = 0; partial + i != chain + depth - 1; i++)
+ count *= epb;
+ /* Fill in size of a hole we found */
+ map->m_pblk = 0;
+ map->m_len = min_t(unsigned int, map->m_len, count);
+ goto cleanup;
+ }
+
+ /* Failed read of indirect block */
+ if (err == -EIO)
goto cleanup;
/*
@@ -693,21 +708,21 @@ retry:
}
if (IS_DAX(inode))
ret = dax_do_io(iocb, inode, iter, offset,
- ext4_get_block, NULL, 0);
+ ext4_dio_get_block, NULL, 0);
else
ret = __blockdev_direct_IO(iocb, inode,
inode->i_sb->s_bdev, iter,
- offset, ext4_get_block, NULL,
- NULL, 0);
+ offset, ext4_dio_get_block,
+ NULL, NULL, 0);
inode_dio_end(inode);
} else {
locked:
if (IS_DAX(inode))
ret = dax_do_io(iocb, inode, iter, offset,
- ext4_get_block, NULL, DIO_LOCKING);
+ ext4_dio_get_block, NULL, DIO_LOCKING);
else
ret = blockdev_direct_IO(iocb, inode, iter, offset,
- ext4_get_block);
+ ext4_dio_get_block);
if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d884989cc83d..7bc6c855cc18 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -482,7 +482,7 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page)
ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
flush_dcache_page(page);
kunmap_atomic(kaddr);
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ zero_user_segment(page, len, PAGE_SIZE);
SetPageUptodate(page);
brelse(iloc.bh);
@@ -507,7 +507,7 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
if (!page->index)
ret = ext4_read_inline_page(inode, page);
else if (!PageUptodate(page)) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
}
@@ -581,9 +581,10 @@ retry:
if (ret)
goto out;
- if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(page, from, to, ext4_get_block_write);
- else
+ if (ext4_should_dioread_nolock(inode)) {
+ ret = __block_write_begin(page, from, to,
+ ext4_get_block_unwritten);
+ } else
ret = __block_write_begin(page, from, to, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
@@ -594,7 +595,7 @@ retry:
if (ret) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
ext4_orphan_add(handle, inode);
up_write(&EXT4_I(inode)->xattr_sem);
@@ -620,7 +621,7 @@ retry:
out:
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (sem_held)
up_write(&EXT4_I(inode)->xattr_sem);
@@ -689,7 +690,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
if (!ext4_has_inline_data(inode)) {
ret = 0;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto out_up_read;
}
@@ -814,7 +815,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
if (ret) {
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ext4_truncate_failed_write(inode);
return ret;
}
@@ -828,7 +829,7 @@ out:
up_read(&EXT4_I(inode)->xattr_sem);
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return ret;
}
@@ -918,7 +919,7 @@ retry_journal:
out_release_page:
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out_journal:
ext4_journal_stop(handle);
out:
@@ -946,7 +947,7 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
i_size_changed = 1;
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
/*
* Don't mark the inode dirty under page lock. First, it unnecessarily
@@ -995,12 +996,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
*/
static int ext4_add_dirent_to_inline(handle_t *handle,
struct ext4_filename *fname,
- struct dentry *dentry,
+ struct inode *dir,
struct inode *inode,
struct ext4_iloc *iloc,
void *inline_start, int inline_size)
{
- struct inode *dir = d_inode(dentry->d_parent);
int err;
struct ext4_dir_entry_2 *de;
@@ -1245,12 +1245,11 @@ out:
* the new created block.
*/
int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry, struct inode *inode)
+ struct inode *dir, struct inode *inode)
{
int ret, inline_size;
void *inline_start;
struct ext4_iloc iloc;
- struct inode *dir = d_inode(dentry->d_parent);
ret = ext4_get_inode_loc(dir, &iloc);
if (ret)
@@ -1264,7 +1263,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
- ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc,
+ ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,
inline_start, inline_size);
if (ret != -ENOSPC)
goto out;
@@ -1285,7 +1284,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
if (inline_size) {
inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
- ret = ext4_add_dirent_to_inline(handle, fname, dentry,
+ ret = ext4_add_dirent_to_inline(handle, fname, dir,
inode, &iloc, inline_start,
inline_size);
@@ -1698,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle,
if (err)
goto out;
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_mark_inode_dirty(handle, dir);
if (unlikely(err))
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e8d620a484f6..981a1fc30eaa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode)
}
truncate_inode_pages_final(&inode->i_data);
- WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
}
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode)
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages_final(&inode->i_data);
- WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
-
/*
* Protect us against freezing - iput() caller didn't have to have any
* protection against it
@@ -383,6 +380,21 @@ static int __check_block_validity(struct inode *inode, const char *func,
return 0;
}
+int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
+ ext4_lblk_t len)
+{
+ int ret;
+
+ if (ext4_encrypted_inode(inode))
+ return ext4_encrypted_zeroout(inode, lblk, pblk, len);
+
+ ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
+ if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -403,8 +415,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* out taking i_data_sem. So at the time the unwritten extent
* could be converted.
*/
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read(&EXT4_I(inode)->i_data_sem);
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -412,8 +423,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- up_read((&EXT4_I(inode)->i_data_sem));
+ up_read((&EXT4_I(inode)->i_data_sem));
/*
* We don't check m_len because extent will be collpased in status
@@ -445,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocated.
- * if create==0 and the blocks are pre-allocated and unwritten block,
- * the result buffer head is unmapped. If the create ==1, it will make sure
- * the buffer head is mapped.
+ * On success, it returns the number of blocks being mapped or allocated. If
+ * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
+ * is marked as unwritten. If create == 1, it will mark @map as mapped.
*
* It returns 0 if plain look up failed (blocks have not been allocated), in
- * that case, buffer head is unmapped
+ * that case, @map is returned as unmapped but we still do fill map->m_len to
+ * indicate the length of a hole starting at map->m_lblk.
*
* It returns the error in case of allocation failure.
*/
@@ -494,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = map->m_len;
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+ map->m_pblk = 0;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
retval = 0;
} else {
BUG_ON(1);
@@ -509,8 +524,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read(&EXT4_I(inode)->i_data_sem);
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -541,8 +555,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (ret < 0)
retval = ret;
}
- if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- up_read((&EXT4_I(inode)->i_data_sem));
+ up_read((&EXT4_I(inode)->i_data_sem));
found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -626,13 +639,29 @@ found:
}
/*
+ * We have to zeroout blocks before inserting them into extent
+ * status tree. Otherwise someone could look them up there and
+ * use them before they are really zeroed.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO &&
+ map->m_flags & EXT4_MAP_MAPPED &&
+ map->m_flags & EXT4_MAP_NEW) {
+ ret = ext4_issue_zeroout(inode, map->m_lblk,
+ map->m_pblk, map->m_len);
+ if (ret) {
+ retval = ret;
+ goto out_sem;
+ }
+ }
+
+ /*
* If the extent has been zeroed out, we don't need to update
* extent status tree.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
if (ext4_es_is_written(&es))
- goto has_zeroout;
+ goto out_sem;
}
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -643,11 +672,13 @@ found:
status |= EXTENT_STATUS_DELAYED;
ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
- if (ret < 0)
+ if (ret < 0) {
retval = ret;
+ goto out_sem;
+ }
}
-has_zeroout:
+out_sem:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map);
@@ -657,16 +688,39 @@ has_zeroout:
return retval;
}
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
+/*
+ * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
+ * we have to be careful as someone else may be manipulating b_state as well.
+ */
+static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
+{
+ unsigned long old_state;
+ unsigned long new_state;
+
+ flags &= EXT4_MAP_FLAGS;
+
+ /* Dummy buffer_head? Set non-atomically. */
+ if (!bh->b_page) {
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
+ return;
+ }
+ /*
+ * Someone else may be modifying b_state. Be careful! This is ugly but
+ * once we get rid of using bh as a container for mapping information
+ * to pass to / from get_block functions, this can go away.
+ */
+ do {
+ old_state = READ_ONCE(bh->b_state);
+ new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
+ } while (unlikely(
+ cmpxchg(&bh->b_state, old_state, new_state) != old_state));
+}
static int _ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int flags)
{
- handle_t *handle = ext4_journal_current_handle();
struct ext4_map_blocks map;
- int ret = 0, started = 0;
- int dio_credits;
+ int ret = 0;
if (ext4_has_inline_data(inode))
return -ERANGE;
@@ -674,43 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
- /* Direct IO write... */
- if (map.m_len > DIO_MAX_BLOCKS)
- map.m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- dio_credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
- }
- started = 1;
- }
-
- ret = ext4_map_blocks(handle, inode, &map, flags);
+ ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
+ flags);
if (ret > 0) {
- ext4_io_end_t *io_end = ext4_inode_aio(inode);
-
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
- if (IS_DAX(inode) && buffer_unwritten(bh)) {
- /*
- * dgc: I suspect unwritten conversion on ext4+DAX is
- * fundamentally broken here when there are concurrent
- * read/write in progress on this inode.
- */
- WARN_ON_ONCE(io_end);
- bh->b_assoc_map = inode->i_mapping;
- bh->b_private = (void *)(unsigned long)iblock;
- }
- if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
- set_buffer_defer_completion(bh);
+ ext4_update_bh_state(bh, map.m_flags);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
- if (started)
- ext4_journal_stop(handle);
return ret;
}
@@ -722,6 +747,153 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
}
/*
+ * Get block function used when preparing for buffered write if we require
+ * creating an unwritten extent if blocks haven't been allocated. The extent
+ * will be converted to written after the IO is complete.
+ */
+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+/*
+ * Get blocks function for the cases that need to start a transaction -
+ * generally different cases of direct IO and DAX IO. It also handles retries
+ * in case of ENOSPC.
+ */
+static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int flags)
+{
+ int dio_credits;
+ handle_t *handle;
+ int retries = 0;
+ int ret;
+
+ /* Trim mapping request to maximum we can map at once for DIO */
+ if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
+ bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ bh_result->b_size >> inode->i_blkbits);
+retry:
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = _ext4_get_block(inode, iblock, bh_result, flags);
+ ext4_journal_stop(handle);
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ return ret;
+}
+
+/* Get block function for DIO reads and writes to inodes without extents */
+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ if (!create)
+ return _ext4_get_block(inode, iblock, bh, 0);
+ return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
+}
+
+/*
+ * Get block function for AIO DIO writes when we create unwritten extent if
+ * blocks are not allocated yet. The extent will be converted to written
+ * after IO is complete.
+ */
+static int ext4_dio_get_block_unwritten_async(struct inode *inode,
+ sector_t iblock, struct buffer_head *bh_result, int create)
+{
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+
+ /*
+ * When doing DIO using unwritten extents, we need io_end to convert
+ * unwritten extents to written on IO completion. We allocate io_end
+ * once we spot unwritten extent and store it in b_private. Generic
+ * DIO code keeps b_private set and furthermore passes the value to
+ * our completion callback in 'private' argument.
+ */
+ if (!ret && buffer_unwritten(bh_result)) {
+ if (!bh_result->b_private) {
+ ext4_io_end_t *io_end;
+
+ io_end = ext4_init_io_end(inode, GFP_KERNEL);
+ if (!io_end)
+ return -ENOMEM;
+ bh_result->b_private = io_end;
+ ext4_set_io_unwritten_flag(inode, io_end);
+ }
+ set_buffer_defer_completion(bh_result);
+ }
+
+ return ret;
+}
+
+/*
+ * Get block function for non-AIO DIO writes when we create unwritten extent if
+ * blocks are not allocated yet. The extent will be converted to written
+ * after IO is complete from ext4_ext_direct_IO() function.
+ */
+static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
+ sector_t iblock, struct buffer_head *bh_result, int create)
+{
+ int ret;
+
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ ret = ext4_get_block_trans(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
+
+ /*
+ * Mark inode as having pending DIO writes to unwritten extents.
+ * ext4_ext_direct_IO() checks this flag and converts extents to
+ * written.
+ */
+ if (!ret && buffer_unwritten(bh_result))
+ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+
+ return ret;
+}
+
+static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+
+ ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /* We don't expect handle for direct IO */
+ WARN_ON_ONCE(ext4_journal_current_handle());
+
+ ret = _ext4_get_block(inode, iblock, bh_result, 0);
+ /*
+ * Blocks should have been preallocated! ext4_file_write_iter() checks
+ * that.
+ */
+ WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
+
+ return ret;
+}
+
+
+/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -879,14 +1051,11 @@ int do_journal_get_write_access(handle_t *handle,
return ret;
}
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-
#ifdef CONFIG_EXT4_FS_ENCRYPTION
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
@@ -898,15 +1067,15 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
bool decrypt = false;
BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > PAGE_SIZE);
+ BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
bbits = ilog2(blocksize);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - bbits);
for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start = block_end, bh = bh->b_this_page) {
@@ -988,8 +1157,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* we allocate blocks but write fails for some reason
*/
needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
- index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
+ index = pos >> PAGE_SHIFT;
+ from = pos & (PAGE_SIZE - 1);
to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -1017,7 +1186,7 @@ retry_grab:
retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
- page_cache_release(page);
+ put_page(page);
return PTR_ERR(handle);
}
@@ -1025,7 +1194,7 @@ retry_journal:
if (page->mapping != mapping) {
/* The page got truncated from under us */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ext4_journal_stop(handle);
goto retry_grab;
}
@@ -1035,13 +1204,14 @@ retry_journal:
#ifdef CONFIG_EXT4_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
- ext4_get_block_write);
+ ext4_get_block_unwritten);
else
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block);
#else
if (ext4_should_dioread_nolock(inode))
- ret = __block_write_begin(page, pos, len, ext4_get_block_write);
+ ret = __block_write_begin(page, pos, len,
+ ext4_get_block_unwritten);
else
ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
@@ -1080,7 +1250,7 @@ retry_journal:
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_journal;
- page_cache_release(page);
+ put_page(page);
return ret;
}
*pagep = page;
@@ -1123,7 +1293,7 @@ static int ext4_write_end(struct file *file,
ret = ext4_jbd2_file_inode(handle, inode);
if (ret) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto errout;
}
}
@@ -1143,7 +1313,7 @@ static int ext4_write_end(struct file *file,
*/
i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -1227,7 +1397,7 @@ static int ext4_journalled_write_end(struct file *file,
int size_changed = 0;
trace_ext4_journalled_write_end(inode, pos, len, copied);
- from = pos & (PAGE_CACHE_SIZE - 1);
+ from = pos & (PAGE_SIZE - 1);
to = from + len;
BUG_ON(!ext4_handle_valid(handle));
@@ -1251,7 +1421,7 @@ static int ext4_journalled_write_end(struct file *file,
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -1365,7 +1535,7 @@ static void ext4_da_page_release_reservation(struct page *page,
int num_clusters;
ext4_fsblk_t lblk;
- BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+ BUG_ON(stop > PAGE_SIZE || stop < length);
head = page_buffers(page);
bh = head;
@@ -1381,7 +1551,7 @@ static void ext4_da_page_release_reservation(struct page *page,
clear_buffer_delay(bh);
} else if (contiguous_blks) {
lblk = page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) -
contiguous_blks;
ext4_es_remove_extent(inode, lblk, contiguous_blks);
@@ -1391,7 +1561,7 @@ static void ext4_da_page_release_reservation(struct page *page,
} while ((bh = bh->b_this_page) != head);
if (contiguous_blks) {
- lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
ext4_es_remove_extent(inode, lblk, contiguous_blks);
}
@@ -1400,7 +1570,7 @@ static void ext4_da_page_release_reservation(struct page *page,
* need to release the reserved space for that cluster. */
num_clusters = EXT4_NUM_B2C(sbi, to_release);
while (num_clusters > 0) {
- lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+ lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
((num_clusters - 1) << sbi->s_cluster_bits);
if (sbi->s_cluster_ratio == 1 ||
!ext4_find_delalloc_cluster(inode, lblk))
@@ -1447,8 +1617,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
- start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ start = index << (PAGE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_SHIFT - inode->i_blkbits);
ext4_es_remove_extent(inode, start, last - start + 1);
}
@@ -1464,7 +1634,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
if (invalidate) {
- block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ block_invalidatepage(page, 0, PAGE_SIZE);
ClearPageUptodate(page);
}
unlock_page(page);
@@ -1669,7 +1839,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
map_bh(bh, inode->i_sb, map.m_pblk);
- bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ ext4_update_bh_state(bh, map.m_flags);
if (buffer_unwritten(bh)) {
/* A delayed write to unwritten bh should be marked
@@ -1835,10 +2005,10 @@ static int ext4_writepage(struct page *page,
trace_ext4_writepage(page);
size = i_size_read(inode);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
page_bufs = page_buffers(page);
/*
@@ -1862,7 +2032,7 @@ static int ext4_writepage(struct page *page,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
if ((current->flags & PF_MEMALLOC) ||
- (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
+ (inode->i_sb->s_blocksize == PAGE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
@@ -1904,10 +2074,10 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
int err;
BUG_ON(page->index != mpd->first_page);
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
clear_page_dirty_for_io(page);
err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
if (!err)
@@ -2041,7 +2211,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
int nr_pages, i;
struct inode *inode = mpd->inode;
struct buffer_head *head, *bh;
- int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+ int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
pgoff_t start, end;
ext4_lblk_t lblk;
sector_t pblock;
@@ -2102,7 +2272,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
* supports blocksize < pagesize as we will try to
* convert potentially unmapped parts of inode.
*/
- mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+ mpd->io_submit.io_end->size += PAGE_SIZE;
/* Page fully mapped - let IO run! */
err = mpage_submit_page(mpd, page);
if (err < 0) {
@@ -2254,7 +2424,7 @@ update_disksize:
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
*/
- disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
loff_t i_size;
@@ -2390,7 +2560,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
mpd->next_page = page->index + 1;
/* Add all dirty buffers to mpd */
lblk = ((ext4_lblk_t)page->index) <<
- (PAGE_CACHE_SHIFT - blkbits);
+ (PAGE_SHIFT - blkbits);
head = page_buffers(page);
err = mpage_process_page_bufs(mpd, head, head, lblk);
if (err <= 0)
@@ -2434,6 +2604,10 @@ static int ext4_writepages(struct address_space *mapping,
trace_ext4_writepages(inode, wbc);
+ if (dax_mapping(mapping))
+ return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+ wbc);
+
/*
* No pages to write? This is mainly a kludge to avoid starting
* a transaction for special inodes like journal inode on last iput()
@@ -2471,7 +2645,7 @@ static int ext4_writepages(struct address_space *mapping,
* We may need to convert up to one extent per block in
* the page and we may dirty the inode.
*/
- rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+ rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
}
/*
@@ -2502,8 +2676,8 @@ static int ext4_writepages(struct address_space *mapping,
mpd.first_page = writeback_index;
mpd.last_page = -1;
} else {
- mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
- mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+ mpd.first_page = wbc->range_start >> PAGE_SHIFT;
+ mpd.last_page = wbc->range_end >> PAGE_SHIFT;
}
mpd.inode = inode;
@@ -2662,7 +2836,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
handle_t *handle;
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
if (ext4_nonda_switch(inode->i_sb)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -2705,7 +2879,7 @@ retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
ext4_da_write_credits(inode, pos, len));
if (IS_ERR(handle)) {
- page_cache_release(page);
+ put_page(page);
return PTR_ERR(handle);
}
@@ -2713,7 +2887,7 @@ retry_journal:
if (page->mapping != mapping) {
/* The page got truncated from under us */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ext4_journal_stop(handle);
goto retry_grab;
}
@@ -2741,7 +2915,7 @@ retry_journal:
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_journal;
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -2789,7 +2963,7 @@ static int ext4_da_write_end(struct file *file,
len, copied, page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
- start = pos & (PAGE_CACHE_SIZE - 1);
+ start = pos & (PAGE_SIZE - 1);
end = start + copied - 1;
/*
@@ -3011,7 +3185,7 @@ static int __ext4_journalled_invalidatepage(struct page *page,
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0 && length == PAGE_CACHE_SIZE)
+ if (offset == 0 && length == PAGE_SIZE)
ClearPageChecked(page);
return jbd2_journal_invalidatepage(journal, page, offset, length);
@@ -3040,58 +3214,105 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-/*
- * ext4_get_block used when preparing for a DIO write or buffer write.
- * We allocate an uinitialized extent if blocks haven't been allocated.
- * The extent will be converted to initialized after the IO is complete.
- */
-int ext4_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+#ifdef CONFIG_FS_DAX
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
- ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
- inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
-}
+ int ret, err;
+ int credits;
+ struct ext4_map_blocks map;
+ handle_t *handle = NULL;
+ int flags = 0;
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_NO_LOCK);
-}
+ map.m_lblk = iblock;
+ map.m_len = bh_result->b_size >> inode->i_blkbits;
+ credits = ext4_chunk_trans_blocks(inode, map.m_len);
+ if (create) {
+ flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+ }
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
- if (create)
- flags |= EXT4_GET_BLOCKS_CREATE;
- ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
- inode->i_ino, create);
- return _ext4_get_block(inode, iblock, bh_result, flags);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (create) {
+ err = ext4_journal_stop(handle);
+ if (ret >= 0 && err < 0)
+ ret = err;
+ }
+ if (ret <= 0)
+ goto out;
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int err2;
+
+ /*
+ * We are protected by i_mmap_sem so we know block cannot go
+ * away from under us even though we dropped i_data_sem.
+ * Convert extent to written and write zeros there.
+ *
+ * Note: We may get here even when create == 0.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ err = ext4_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
+ if (err < 0)
+ ret = err;
+ err2 = ext4_journal_stop(handle);
+ if (err2 < 0 && ret > 0)
+ ret = err2;
+ }
+out:
+ WARN_ON_ONCE(ret == 0 && create);
+ if (ret > 0) {
+ map_bh(bh_result, inode->i_sb, map.m_pblk);
+ /*
+ * At least for now we have to clear BH_New so that DAX code
+ * doesn't attempt to zero blocks again in a racy way.
+ */
+ map.m_flags &= ~EXT4_MAP_NEW;
+ ext4_update_bh_state(bh_result, map.m_flags);
+ bh_result->b_size = map.m_len << inode->i_blkbits;
+ ret = 0;
+ }
+ return ret;
}
+#endif
-static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
- ext4_io_end_t *io_end = iocb->private;
+ ext4_io_end_t *io_end = private;
/* if not async direct IO just return */
if (!io_end)
- return;
+ return 0;
ext_debug("ext4_end_io_dio(): io_end 0x%p "
"for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
- iocb->private, io_end->inode->i_ino, iocb, offset,
- size);
+ io_end, io_end->inode->i_ino, iocb, offset, size);
- iocb->private = NULL;
+ /*
+ * Error during AIO DIO. We cannot convert unwritten extents as the
+ * data was not written. Just clear the unwritten flag and drop io_end.
+ */
+ if (size <= 0) {
+ ext4_clear_io_unwritten_flag(io_end);
+ size = 0;
+ }
io_end->offset = offset;
io_end->size = size;
ext4_put_io_end(io_end);
+
+ return 0;
}
/*
@@ -3124,7 +3345,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
get_block_t *get_block_func = NULL;
int dio_flags = 0;
loff_t final_size = offset + count;
- ext4_io_end_t *io_end = NULL;
/* Use the old path for reads and writes beyond i_size. */
if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3143,24 +3363,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
/* If we do a overwrite dio, i_mutex locking can be released */
overwrite = *((int *)iocb->private);
- if (overwrite) {
- down_read(&EXT4_I(inode)->i_data_sem);
- mutex_unlock(&inode->i_mutex);
- }
+ if (overwrite)
+ inode_unlock(inode);
/*
* We could direct write to holes and fallocate.
*
- * Allocated blocks to fill the hole are marked as
- * unwritten to prevent parallel buffered read to expose
- * the stale data before DIO complete the data IO.
+ * Blocks allocated to fill the hole are marked as unwritten to prevent
+ * a parallel buffered read from exposing stale data before DIO completes
+ * the data IO.
*
- * As to previously fallocated extents, ext4 get_block will
- * just simply mark the buffer mapped but still keep the
- * extents unwritten.
+ * As for previously fallocated extents, ext4 get_block will simply mark
+ * the buffer mapped but still keep the extents unwritten.
*
- * For non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
+ * For the non-AIO case, we convert those unwritten extents to written
+ * after returning from blockdev_direct_IO. That way we save ourselves from
+ * allocating an io_end structure and also the overhead of offloading
+ * the extent conversion to a workqueue.
*
* For async DIO, the conversion needs to be deferred when the
* IO is completed. The ext4 end_io callback function will be
@@ -3168,30 +3387,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* case, we allocate an io_end structure to hook to the iocb.
*/
iocb->private = NULL;
- ext4_inode_aio_set(inode, NULL);
- if (!is_sync_kiocb(iocb)) {
- io_end = ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end) {
- ret = -ENOMEM;
- goto retake_lock;
- }
- /*
- * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
- */
- iocb->private = ext4_get_io_end(io_end);
- /*
- * we save the io structure for current async direct
- * IO, so that later ext4_map_blocks() could flag the
- * io structure whether there is a unwritten extents
- * needs to be converted when IO is completed.
- */
- ext4_inode_aio_set(inode, io_end);
- }
-
- if (overwrite) {
- get_block_func = ext4_get_block_write_nolock;
+ if (overwrite)
+ get_block_func = ext4_dio_get_block_overwrite;
+ else if (is_sync_kiocb(iocb)) {
+ get_block_func = ext4_dio_get_block_unwritten_sync;
+ dio_flags = DIO_LOCKING;
} else {
- get_block_func = ext4_get_block_write;
+ get_block_func = ext4_dio_get_block_unwritten_async;
dio_flags = DIO_LOCKING;
}
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3206,27 +3408,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
get_block_func,
ext4_end_io_dio, NULL, dio_flags);
- /*
- * Put our reference to io_end. This can free the io_end structure e.g.
- * in sync IO case or in case of error. It can even perform extent
- * conversion if all bios we submitted finished before we got here.
- * Note that in that case iocb->private can be already set to NULL
- * here.
- */
- if (io_end) {
- ext4_inode_aio_set(inode, NULL);
- ext4_put_io_end(io_end);
- /*
- * When no IO was submitted ext4_end_io_dio() was not
- * called so we have to put iocb's reference.
- */
- if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
- WARN_ON(iocb->private != io_end);
- WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
- ext4_put_io_end(io_end);
- iocb->private = NULL;
- }
- }
if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
@@ -3241,14 +3422,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
-retake_lock:
if (iov_iter_rw(iter) == WRITE)
inode_dio_end(inode);
/* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite) {
- up_read(&EXT4_I(inode)->i_data_sem);
- mutex_lock(&inode->i_mutex);
- }
+ if (overwrite)
+ inode_lock(inode);
return ret;
}
@@ -3376,8 +3554,8 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ ext4_fsblk_t index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
@@ -3385,14 +3563,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
struct page *page;
int err = 0;
- page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ page = find_or_create_page(mapping, from >> PAGE_SHIFT,
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page)
return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -3434,7 +3612,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
ext4_encrypted_inode(inode)) {
/* We expect the key to be set. */
BUG_ON(!ext4_has_encryption_key(inode));
- BUG_ON(blocksize != PAGE_CACHE_SIZE);
+ BUG_ON(blocksize != PAGE_SIZE);
WARN_ON_ONCE(ext4_decrypt(page));
}
}
@@ -3458,7 +3636,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -3473,7 +3651,7 @@ static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
struct inode *inode = mapping->host;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned max = blocksize - (offset & (blocksize - 1));
@@ -3498,7 +3676,7 @@ static int ext4_block_zero_page_range(handle_t *handle,
static int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned length;
unsigned blocksize;
struct inode *inode = mapping->host;
@@ -3559,6 +3737,35 @@ int ext4_can_truncate(struct inode *inode)
}
/*
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise i_disksize update
+ * can get lost as it may have been postponed to submission of writeback but
+ * that will never happen after we truncate page cache.
+ *
+ * @inode:  inode whose on-disk size may need to be brought up to date; the
+ *          function warns if the inode lock is not held by the caller
+ * @offset: start of the byte range about to be punched / zeroed
+ * @len:    length of that byte range
+ *
+ * Work is only done when the range [offset, offset + len] contains the
+ * current i_size and i_disksize is still smaller than i_size. In that case
+ * a one-credit EXT4_HT_MISC handle is started, i_disksize is raised to
+ * i_size and the inode is marked dirty.
+ *
+ * Returns 0 on success or when nothing had to be done, and the error from
+ * ext4_journal_start() if the handle could not be started.
+ *
+ * NOTE(review): the return values of ext4_mark_inode_dirty() and
+ * ext4_journal_stop() are not propagated, so a failure there is reported
+ * to the caller as success - confirm this is intended.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ loff_t size = i_size_read(inode);
+
+ WARN_ON(!inode_is_locked(inode));
+ if (offset > size || offset + len < size)
+ return 0; /* range does not contain i_size, nothing to fix up */
+
+ if (EXT4_I(inode)->i_disksize >= size)
+ return 0; /* on-disk size is already current */
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ext4_update_i_disksize(inode, size);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+ return 0;
+}
+
+/*
* ext4_punch_hole: punches a hole in a file by releaseing the blocks
* associated with the given offset and length
*
@@ -3595,7 +3802,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
return ret;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
@@ -3607,7 +3814,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
*/
if (offset + length > inode->i_size) {
length = inode->i_size +
- PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
offset;
}
@@ -3623,17 +3830,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
+ /* Wait for all existing dio workers; newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Prevent page faults from reinstantiating pages we have released from
+ * page cache.
+ */
+ down_write(&EXT4_I(inode)->i_mmap_sem);
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset)
+ if (last_block_offset > first_block_offset) {
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
+ if (ret)
+ goto out_dio;
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
-
- /* Wait all existing dio workers, newcomers will block on i_mutex */
- ext4_inode_block_unlocked_dio(inode);
- inode_dio_wait(inode);
+ }
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
@@ -3680,19 +3896,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- /* Now release the pages again to reduce race window */
- if (last_block_offset > first_block_offset)
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
-
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
out_dio:
+ up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -3762,7 +3974,7 @@ void ext4_truncate(struct inode *inode)
* have i_mutex locked because it's not necessary.
*/
if (!(inode->i_state & (I_NEW|I_FREEING)))
- WARN_ON(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON(!inode_is_locked(inode));
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
@@ -4010,7 +4222,7 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
- if (test_opt(inode->i_sb, DAX))
+ if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
@@ -4076,6 +4288,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
EXT4_I(inode)->i_inline_off = 0;
}
+int ext4_get_projid(struct inode *inode, kprojid_t *projid)
+{
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+ return -EOPNOTSUPP;
+ *projid = EXT4_I(inode)->i_projid;
+ return 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -4087,6 +4307,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
int block;
uid_t i_uid;
gid_t i_gid;
+ projid_t i_projid;
inode = iget_locked(sb, ino);
if (!inode)
@@ -4136,12 +4357,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
+ else
+ i_projid = EXT4_DEF_PROJID;
+
if (!(test_opt(inode->i_sb, NO_UID32))) {
i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
+ ei->i_projid = make_kprojid(&init_user_ns, i_projid);
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -4283,6 +4512,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
+ inode_nohighmem(inode);
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
@@ -4439,6 +4669,7 @@ static int ext4_do_update_inode(handle_t *handle,
int need_datasync = 0, set_large_file = 0;
uid_t i_uid;
gid_t i_gid;
+ projid_t i_projid;
spin_lock(&ei->i_raw_lock);
@@ -4451,6 +4682,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
i_uid = i_uid_read(inode);
i_gid = i_gid_read(inode);
+ i_projid = from_kprojid(&init_user_ns, ei->i_projid);
if (!(test_opt(inode->i_sb, NO_UID32))) {
raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
@@ -4528,6 +4760,15 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le16(ei->i_extra_isize);
}
}
+
+ BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+ i_projid != EXT4_DEF_PROJID);
+
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+ EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+ raw_inode->i_projid = cpu_to_le32(i_projid);
+
ext4_inode_csum_set(inode, raw_inode, ei);
spin_unlock(&ei->i_raw_lock);
if (inode->i_sb->s_flags & MS_LAZYTIME)
@@ -4648,23 +4889,23 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
tid_t commit_tid = 0;
int ret;
- offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ offset = inode->i_size & (PAGE_SIZE - 1);
/*
* All buffers in the last page remain valid? Then there's nothing to
- * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
+ * do. We do the check mainly to optimize the common PAGE_SIZE ==
* blocksize case
*/
- if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+ if (offset > PAGE_SIZE - (1 << inode->i_blkbits))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
+ inode->i_size >> PAGE_SHIFT);
if (!page)
return;
ret = __ext4_journalled_invalidatepage(page, offset,
- PAGE_CACHE_SIZE - offset);
+ PAGE_SIZE - offset);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (ret != -EBUSY)
return;
commit_tid = 0;
@@ -4823,6 +5064,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
} else
ext4_wait_for_tail_page_commit(inode);
}
+ down_write(&EXT4_I(inode)->i_mmap_sem);
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
@@ -4830,6 +5072,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
truncate_pagecache(inode, inode->i_size);
if (shrink)
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
if (!rc) {
@@ -5081,6 +5324,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
might_sleep();
trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ return err;
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
!ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5111,9 +5356,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
}
}
}
- if (!err)
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
- return err;
+ return ext4_mark_iloc_dirty(handle, inode, &iloc);
}
/*
@@ -5278,12 +5521,14 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = __block_page_mkwrite(vma, vmf,
+ ret = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
@@ -5299,10 +5544,10 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
- if (page->index == size >> PAGE_CACHE_SHIFT)
- len = size & ~PAGE_CACHE_MASK;
+ if (page->index == size >> PAGE_SHIFT)
+ len = size & ~PAGE_MASK;
else
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
/*
* Return if we have all the buffers mapped. This avoids the need to do
* journal_start/journal_stop which can block and take a long time
@@ -5320,7 +5565,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unlock_page(page);
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
- get_block = ext4_get_block_write;
+ get_block = ext4_get_block_unwritten;
else
get_block = ext4_get_block;
retry_alloc:
@@ -5330,10 +5575,10 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = __block_page_mkwrite(vma, vmf, get_block);
+ ret = block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+ PAGE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
ext4_journal_stop(handle);
@@ -5347,6 +5592,86 @@ retry_alloc:
out_ret:
ret = block_page_mkwrite_return(ret);
out:
+ up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ int err;
+
+ down_read(&EXT4_I(inode)->i_mmap_sem);
+ err = filemap_fault(vma, vmf);
+ up_read(&EXT4_I(inode)->i_mmap_sem);
+
+ return err;
+}
+
+/*
+ * Find the first extent at or after @lblk in an inode that is not a hole.
+ * Search for @map_len blocks at most. The extent is returned in @result.
+ *
+ * @inode:   inode to search
+ * @lblk:    first logical block of the search window
+ * @map_len: maximum number of blocks to search, starting at @lblk
+ * @result:  filled in with the logical start, length, physical block and
+ *           status of the extent that was found
+ *
+ * Each iteration first asks ext4_map_blocks() (NULL handle, no flags) for
+ * a mapping at the current position. A mapped range is reported with
+ * status EXTENT_STATUS_UNWRITTEN when EXT4_MAP_UNWRITTEN is set in the
+ * mapping flags and EXTENT_STATUS_WRITTEN otherwise. If nothing is mapped,
+ * the extent status tree is consulted for a delayed extent inside the
+ * range just probed; a delayed extent that begins before @lblk is trimmed
+ * so that the reported range starts at @lblk. Otherwise the search window
+ * is advanced past the hole and the loop continues.
+ *
+ * The function returns 1 if we found an extent. The function returns 0 in
+ * case there is no extent at or after @lblk and in that case also sets
+ * @result->es_len to 0. In case of error, the error code is returned.
+ */
+int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+ unsigned int map_len, struct extent_status *result)
+{
+ struct ext4_map_blocks map;
+ struct extent_status es = {};
+ int ret;
+
+ map.m_lblk = lblk;
+ map.m_len = map_len;
+
+ /*
+ * For non-extent based files this loop may iterate several times since
+ * we do not determine full hole size.
+ */
+ while (map.m_len > 0) {
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ /* There's extent covering m_lblk? Just return it. */
+ if (ret > 0) {
+ int status;
+
+ ext4_es_store_pblock(result, map.m_pblk);
+ result->es_lblk = map.m_lblk;
+ result->es_len = map.m_len;
+ if (map.m_flags & EXT4_MAP_UNWRITTEN)
+ status = EXTENT_STATUS_UNWRITTEN;
+ else
+ status = EXTENT_STATUS_WRITTEN;
+ ext4_es_store_status(result, status);
+ return 1;
+ }
+ ext4_es_find_delayed_extent_range(inode, map.m_lblk,
+ map.m_lblk + map.m_len - 1,
+ &es);
+ /* Is delalloc data before next block in extent tree? */
+ if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
+ ext4_lblk_t offset = 0;
+
+ if (es.es_lblk < lblk)
+ offset = lblk - es.es_lblk;
+ result->es_lblk = es.es_lblk + offset;
+ ext4_es_store_pblock(result,
+ ext4_es_pblock(&es) + offset);
+ result->es_len = es.es_len - offset;
+ ext4_es_store_status(result, ext4_es_status(&es));
+
+ return 1;
+ }
+ /* There's a hole at m_lblk, advance us after it */
+ map.m_lblk += map.m_len;
+ map_len -= map.m_len;
+ map.m_len = map_len;
+ cond_resched();
+ }
+ result->es_len = 0;
+ return 0;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5e872fd40e5e..eae5917c534e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/random.h>
+#include <linux/quotaops.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16])
return 1;
}
+static int ext4_ioctl_setflags(struct inode *inode,
+ unsigned int flags)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ handle_t *handle = NULL;
+ int err = -EPERM, migrate = 0;
+ struct ext4_iloc iloc;
+ unsigned int oldflags, mask, i;
+ unsigned int jflag;
+
+ /* Is it quota file? Do not allow user to mess with it */
+ if (IS_NOQUOTA(inode))
+ goto flags_out;
+
+ oldflags = ei->i_flags;
+
+ /* The JOURNAL_DATA flag is modifiable only by root */
+ jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+ /*
+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ * the relevant capability.
+ *
+ * This test looks nicer. Thanks to Pauline Middelink
+ */
+ if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ goto flags_out;
+ }
+
+ /*
+ * The JOURNAL_DATA flag can only be changed by
+ * the relevant capability.
+ */
+ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if (!capable(CAP_SYS_RESOURCE))
+ goto flags_out;
+ }
+ if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+ migrate = 1;
+
+ if (flags & EXT4_EOFBLOCKS_FL) {
+ /* we don't support adding EOFBLOCKS flag */
+ if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+ } else if (oldflags & EXT4_EOFBLOCKS_FL)
+ ext4_truncate(inode);
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto flags_out;
+ }
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto flags_err;
+
+ for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+ if (!(mask & EXT4_FL_USER_MODIFIABLE))
+ continue;
+ if (mask & flags)
+ ext4_set_inode_flag(inode, i);
+ else
+ ext4_clear_inode_flag(inode, i);
+ }
+
+ ext4_set_inode_flags(inode);
+ inode->i_ctime = ext4_current_time(inode);
+
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+ ext4_journal_stop(handle);
+ if (err)
+ goto flags_out;
+
+ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+ err = ext4_change_inode_journal_flag(inode, jflag);
+ if (err)
+ goto flags_out;
+ if (migrate) {
+ if (flags & EXT4_EXTENTS_FL)
+ err = ext4_ext_migrate(inode);
+ else
+ err = ext4_ind_migrate(inode);
+ }
+
+flags_out:
+ return err;
+}
+
+#ifdef CONFIG_QUOTA
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+ struct inode *inode = file_inode(filp);
+ struct super_block *sb = inode->i_sb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int err, rc;
+ handle_t *handle;
+ kprojid_t kprojid;
+ struct ext4_iloc iloc;
+ struct ext4_inode *raw_inode;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+ if (projid != EXT4_DEF_PROJID)
+ return -EOPNOTSUPP;
+ else
+ return 0;
+ }
+
+ if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE)
+ return -EOPNOTSUPP;
+
+ kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
+
+ if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
+ return 0;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ err = -EPERM;
+ inode_lock(inode);
+ /* Is it quota file? Do not allow user to mess with it */
+ if (IS_NOQUOTA(inode))
+ goto out_unlock;
+
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ goto out_unlock;
+
+ raw_inode = ext4_raw_inode(&iloc);
+ if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
+ err = -EOVERFLOW;
+ brelse(iloc.bh);
+ goto out_unlock;
+ }
+ brelse(iloc.bh);
+
+ dquot_initialize(inode);
+
+ handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+ EXT4_QUOTA_INIT_BLOCKS(sb) +
+ EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_unlock;
+ }
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_stop;
+
+ if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
+ struct dquot *transfer_to[MAXQUOTAS] = { };
+
+ transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+ if (transfer_to[PRJQUOTA]) {
+ err = __dquot_transfer(inode, transfer_to);
+ dqput(transfer_to[PRJQUOTA]);
+ if (err)
+ goto out_dirty;
+ }
+ }
+ EXT4_I(inode)->i_projid = kprojid;
+ inode->i_ctime = ext4_current_time(inode);
+out_dirty:
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+out_stop:
+ ext4_journal_stop(handle);
+out_unlock:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return err;
+}
+#else
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+ if (projid != EXT4_DEF_PROJID)
+ return -EOPNOTSUPP;
+ return 0;
+}
+#endif
+
+/*
+ * Transfer internal flags to xflags.
+ *
+ * Translates the EXT4_*_FL bits in @iflags to the matching FS_XFLAG_* bits
+ * and returns the result. Only SYNC, IMMUTABLE, APPEND, NODUMP, NOATIME and
+ * PROJINHERIT are translated; every other bit in @iflags is ignored.
+ */
+static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
+{
+ __u32 xflags = 0;
+
+ if (iflags & EXT4_SYNC_FL)
+ xflags |= FS_XFLAG_SYNC;
+ if (iflags & EXT4_IMMUTABLE_FL)
+ xflags |= FS_XFLAG_IMMUTABLE;
+ if (iflags & EXT4_APPEND_FL)
+ xflags |= FS_XFLAG_APPEND;
+ if (iflags & EXT4_NODUMP_FL)
+ xflags |= FS_XFLAG_NODUMP;
+ if (iflags & EXT4_NOATIME_FL)
+ xflags |= FS_XFLAG_NOATIME;
+ if (iflags & EXT4_PROJINHERIT_FL)
+ xflags |= FS_XFLAG_PROJINHERIT;
+ return xflags;
+}
+
+/*
+ * Transfer xflags flags to internal.
+ *
+ * Translates the FS_XFLAG_* bits in @xflags to the matching EXT4_*_FL bits
+ * and returns the result. Only SYNC, IMMUTABLE, APPEND, NODUMP, NOATIME and
+ * PROJINHERIT are translated; every other bit in @xflags is ignored.
+ */
+static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
+{
+ unsigned long iflags = 0;
+
+ if (xflags & FS_XFLAG_SYNC)
+ iflags |= EXT4_SYNC_FL;
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ iflags |= EXT4_IMMUTABLE_FL;
+ if (xflags & FS_XFLAG_APPEND)
+ iflags |= EXT4_APPEND_FL;
+ if (xflags & FS_XFLAG_NODUMP)
+ iflags |= EXT4_NODUMP_FL;
+ if (xflags & FS_XFLAG_NOATIME)
+ iflags |= EXT4_NOATIME_FL;
+ if (xflags & FS_XFLAG_PROJINHERIT)
+ iflags |= EXT4_PROJINHERIT_FL;
+
+ return iflags;
+}
+
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
- handle_t *handle = NULL;
- int err, migrate = 0;
- struct ext4_iloc iloc;
- unsigned int oldflags, mask, i;
- unsigned int jflag;
+ int err;
if (!inode_owner_or_capable(inode))
return -EACCES;
@@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = ext4_mask_flags(inode->i_mode, flags);
- err = -EPERM;
- mutex_lock(&inode->i_mutex);
- /* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode))
- goto flags_out;
-
- oldflags = ei->i_flags;
-
- /* The JOURNAL_DATA flag is modifiable only by root */
- jflag = flags & EXT4_JOURNAL_DATA_FL;
-
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- *
- * This test looks nicer. Thanks to Pauline Middelink
- */
- if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- goto flags_out;
- }
-
- /*
- * The JOURNAL_DATA flag can only be changed by
- * the relevant capability.
- */
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE))
- goto flags_out;
- }
- if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
- migrate = 1;
-
- if (flags & EXT4_EOFBLOCKS_FL) {
- /* we don't support adding EOFBLOCKS flag */
- if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
- err = -EOPNOTSUPP;
- goto flags_out;
- }
- } else if (oldflags & EXT4_EOFBLOCKS_FL)
- ext4_truncate(inode);
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
- goto flags_out;
- }
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
- goto flags_err;
-
- for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
- if (!(mask & EXT4_FL_USER_MODIFIABLE))
- continue;
- if (mask & flags)
- ext4_set_inode_flag(inode, i);
- else
- ext4_clear_inode_flag(inode, i);
- }
-
- ext4_set_inode_flags(inode);
- inode->i_ctime = ext4_current_time(inode);
-
- err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-flags_err:
- ext4_journal_stop(handle);
- if (err)
- goto flags_out;
-
- if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
- err = ext4_change_inode_journal_flag(inode, jflag);
- if (err)
- goto flags_out;
- if (migrate) {
- if (flags & EXT4_EXTENTS_FL)
- err = ext4_ext_migrate(inode);
- else
- err = ext4_ind_migrate(inode);
- }
-
-flags_out:
- mutex_unlock(&inode->i_mutex);
+ inode_lock(inode);
+ err = ext4_ioctl_setflags(inode, flags);
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return err;
}
@@ -349,7 +497,7 @@ flags_out:
goto setversion_out;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@@ -364,7 +512,7 @@ flags_out:
ext4_journal_stop(handle);
unlock_out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
setversion_out:
mnt_drop_write_file(filp);
return err;
@@ -435,6 +583,11 @@ group_extend_out:
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
goto mext_out;
+ } else if (IS_DAX(inode)) {
+ ext4_msg(sb, KERN_ERR,
+ "Online defrag not supported with DAX");
+ err = -EOPNOTSUPP;
+ goto mext_out;
}
err = mnt_want_write_file(filp);
@@ -510,9 +663,9 @@ group_add_out:
* ext4_ext_swap_inode_data before we switch the
* inode format to prevent read.
*/
- mutex_lock(&(inode->i_mutex));
+ inode_lock((inode));
err = ext4_ext_migrate(inode);
- mutex_unlock(&(inode->i_mutex));
+ inode_unlock((inode));
mnt_drop_write_file(filp);
return err;
}
@@ -689,6 +842,60 @@ encryption_policy_out:
return -EOPNOTSUPP;
#endif
}
+ case EXT4_IOC_FSGETXATTR:
+ {
+ struct fsxattr fa;
+
+ memset(&fa, 0, sizeof(struct fsxattr));
+ ext4_get_inode_flags(ei);
+ fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+ fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
+ EXT4_I(inode)->i_projid);
+ }
+
+ if (copy_to_user((struct fsxattr __user *)arg,
+ &fa, sizeof(fa)))
+ return -EFAULT;
+ return 0;
+ }
+ case EXT4_IOC_FSSETXATTR:
+ {
+ struct fsxattr fa;
+ int err;
+
+ if (copy_from_user(&fa, (struct fsxattr __user *)arg,
+ sizeof(fa)))
+ return -EFAULT;
+
+ /* Make sure caller has proper permission */
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ flags = ext4_xflags_to_iflags(fa.fsx_xflags);
+ flags = ext4_mask_flags(inode->i_mode, flags);
+
+ inode_lock(inode);
+ flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
+ (flags & EXT4_FL_XFLAG_VISIBLE);
+ err = ext4_ioctl_setflags(inode, flags);
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+
+ err = ext4_ioctl_setproject(filp, fa.fsx_projid);
+ if (err)
+ return err;
+
+ return 0;
+ }
default:
return -ENOTTY;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b4b3c1f91814..eeeade76012e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -11,7 +11,7 @@
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
*/
@@ -119,7 +119,7 @@ MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
*
*
* one block each for bitmap and buddy information. So for each group we
- * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
+ * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
* blocksize) blocks. So it can have information regarding groups_per_page
* which is blocks_per_page/2
*
@@ -807,7 +807,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
*
* one block each for bitmap and buddy information.
* So for each group we take up 2 blocks. A page can
- * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
+ * contain blocks_per_page (PAGE_SIZE / blocksize) blocks.
* So it can have information regarding groups_per_page which
* is blocks_per_page/2
*
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
* for this page; do not hold this lock when calling this routine!
*/
-static int ext4_mb_init_cache(struct page *page, char *incore)
+static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
int blocksize;
@@ -839,7 +839,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
sb = inode->i_sb;
ngroups = ext4_get_groups_count(sb);
blocksize = 1 << inode->i_blkbits;
- blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+ blocks_per_page = PAGE_SIZE / blocksize;
groups_per_page = blocks_per_page >> 1;
if (groups_per_page == 0)
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
i = sizeof(struct buffer_head *) * groups_per_page;
- bh = kzalloc(i, GFP_NOFS);
+ bh = kzalloc(i, gfp);
if (bh == NULL) {
err = -ENOMEM;
goto out;
@@ -983,7 +983,7 @@ out:
* are on the same page e4b->bd_buddy_page is NULL and return value is 0.
*/
static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
{
struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
int block, pnum, poff;
@@ -993,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ blocks_per_page = PAGE_SIZE / sb->s_blocksize;
/*
* the buddy cache inode stores the block bitmap
* and buddy information in consecutive blocks.
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block = group * 2;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
block++;
pnum = block / blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
@@ -1028,11 +1028,11 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page) {
unlock_page(e4b->bd_bitmap_page);
- page_cache_release(e4b->bd_bitmap_page);
+ put_page(e4b->bd_bitmap_page);
}
if (e4b->bd_buddy_page) {
unlock_page(e4b->bd_buddy_page);
- page_cache_release(e4b->bd_buddy_page);
+ put_page(e4b->bd_buddy_page);
}
}
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
* calling this routine!
*/
static noinline_for_stack
-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
{
struct ext4_group_info *this_grp;
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* The call to ext4_mb_get_buddy_page_lock will mark the
* page accessed.
*/
- ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
}
page = e4b.bd_bitmap_page;
- ret = ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL, gfp);
if (ret)
goto err;
if (!PageUptodate(page)) {
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
}
/* init buddy cache */
page = e4b.bd_buddy_page;
- ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
if (ret)
goto err;
if (!PageUptodate(page)) {
@@ -1109,8 +1109,8 @@ err:
* calling this routine!
*/
static noinline_for_stack int
-ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
- struct ext4_buddy *e4b)
+ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b, gfp_t gfp)
{
int blocks_per_page;
int block;
@@ -1125,7 +1125,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
might_sleep();
mb_debug(1, "load group %u\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
e4b->bd_blkbits = sb->s_blocksize_bits;
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* we need full data about the group
* to make a good selection
*/
- ret = ext4_mb_init_group(sb, group);
+ ret = ext4_mb_init_group(sb, group, gfp);
if (ret)
return ret;
}
@@ -1167,12 +1167,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* is yet to initialize the same. So
* wait for it to initialize.
*/
- page_cache_release(page);
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ put_page(page);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL, gfp);
if (ret) {
unlock_page(page);
goto err;
@@ -1203,12 +1203,13 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
- page_cache_release(page);
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ put_page(page);
+ page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
+ gfp);
if (ret) {
unlock_page(page);
goto err;
@@ -1237,22 +1238,28 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
err:
if (page)
- page_cache_release(page);
+ put_page(page);
if (e4b->bd_bitmap_page)
- page_cache_release(e4b->bd_bitmap_page);
+ put_page(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
- page_cache_release(e4b->bd_buddy_page);
+ put_page(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
return ret;
}
+static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b)
+{
+ return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
+}
+
static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
- page_cache_release(e4b->bd_bitmap_page);
+ put_page(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
- page_cache_release(e4b->bd_buddy_page);
+ put_page(e4b->bd_buddy_page);
}
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- int ret = ext4_mb_init_group(ac->ac_sb, group);
+ int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
if (ret)
return ret;
}
@@ -2285,7 +2292,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
if (group == 0)
seq_puts(seq, "#group: free frags first ["
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
- " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]");
+ " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
@@ -2826,8 +2833,8 @@ static void ext4_free_data_callback(struct super_block *sb,
/* No more items in the per group rb tree
* balance refcounts from ext4_mb_free_metadata()
*/
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
+ put_page(e4b.bd_buddy_page);
+ put_page(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->efd_group);
kmem_cache_free(ext4_free_data_cachep, entry);
@@ -3333,8 +3340,8 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
atomic_inc(&pa->pa_count);
return pa;
}
- cur_distance = abs64(goal_block - cpa->pa_pstart);
- new_distance = abs64(goal_block - pa->pa_pstart);
+ cur_distance = abs(goal_block - cpa->pa_pstart);
+ new_distance = abs(goal_block - pa->pa_pstart);
if (cur_distance <= new_distance)
return cpa;
@@ -4378,9 +4385,9 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
- page_cache_release(ac->ac_bitmap_page);
+ put_page(ac->ac_bitmap_page);
if (ac->ac_buddy_page)
- page_cache_release(ac->ac_buddy_page);
+ put_page(ac->ac_buddy_page);
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
@@ -4592,8 +4599,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* otherwise we'll refresh it from
* on-disk bitmap and lose not-yet-available
* blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
+ get_page(e4b->bd_buddy_page);
+ get_page(e4b->bd_bitmap_page);
}
while (*n) {
parent = *n;
@@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
}
/*
- * We need to make sure we don't reuse the freed block until
- * after the transaction is committed, which we can do by
- * treating the block as metadata, below. We make an
- * exception if the inode is to be written in writeback mode
- * since writeback mode has weak data consistency guarantees.
- */
- if (!ext4_should_writeback_data(inode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
-
- /*
* If the extent to be freed does not begin on a cluster
* boundary, we need to deal with partial clusters at the
* beginning and end of the extent. Normally we will free
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
for (i = 0; i < count; i++) {
cond_resched();
- bh = sb_find_get_block(inode->i_sb, block + i);
- if (!bh)
- continue;
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block + i);
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
@@ -4815,16 +4811,23 @@ do_more:
#endif
trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
+ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
+ GFP_NOFS|__GFP_NOFAIL);
if (err)
goto error_return;
- if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
+ /*
+ * We need to make sure we don't reuse the freed block until after the
+ * transaction is committed. We make an exception if the inode is to be
+ * written in writeback mode since writeback mode has weak data
+ * consistency guarantees.
+ */
+ if (ext4_handle_valid(handle) &&
+ ((flags & EXT4_FREE_BLOCKS_METADATA) ||
+ !ext4_should_writeback_data(inode))) {
struct ext4_free_data *new_entry;
/*
- * blocks being freed are metadata. these blocks shouldn't
- * be used until this transaction is committed
- *
* We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
* to fail.
*/
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
grp = ext4_get_group_info(sb, group);
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- ret = ext4_mb_init_group(sb, group);
+ ret = ext4_mb_init_group(sb, group, GFP_NOFS);
if (ret)
break;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d634e183b4d4..3ef1df6ae9ec 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,18 +23,6 @@
#include "ext4.h"
/*
- * with AGGRESSIVE_CHECK allocator runs consistency checks over
- * structures. these checks slow things down a lot
- */
-#define AGGRESSIVE_CHECK__
-
-/*
- * with DOUBLE_CHECK defined mballoc creates persistent in-core
- * bitmaps, maintains and uses them to check for double allocations
- */
-#define DOUBLE_CHECK__
-
-/*
*/
#ifdef CONFIG_EXT4_DEBUG
extern ushort ext4_mballoc_debug;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a4651894cc33..364ea4d4a943 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* blocks.
*
* While converting to extents we need not
- * update the orignal inode i_blocks for extent blocks
+ * update the original inode i_blocks for extent blocks
* via quota APIs. The quota update happened via tmp_inode already.
*/
spin_lock(&inode->i_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 0a512aa81bf7..24445275d330 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
- brelse(*bh);
- *bh = NULL;
ret = -EIO;
goto warn_exit;
}
-
mmp = (struct mmp_struct *)((*bh)->b_data);
- if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
ret = -EFSCORRUPTED;
- else if (!ext4_mmp_csum_verify(sb, mmp))
+ goto warn_exit;
+ }
+ if (!ext4_mmp_csum_verify(sb, mmp)) {
ret = -EFSBADCRC;
- else
- return 0;
-
+ goto warn_exit;
+ }
+ return 0;
warn_exit:
+ brelse(*bh);
+ *bh = NULL;
ext4_warning(sb, "Error %d while reading MMP block %llu",
ret, mmp_block);
return ret;
@@ -181,15 +182,13 @@ static int kmmpd(void *data)
EXT4_FEATURE_INCOMPAT_MMP)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
if (sb->s_flags & MS_RDONLY) {
ext4_warning(sb, "kmmpd being stopped since filesystem "
"has been remounted as readonly.");
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
diff = jiffies - last_update_time;
@@ -211,9 +210,7 @@ static int kmmpd(void *data)
if (retval) {
ext4_error(sb, "error reading MMP data: %d",
retval);
-
- EXT4_SB(sb)->s_mmp_tsk = NULL;
- goto failed;
+ goto exit_thread;
}
mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -225,7 +222,9 @@ static int kmmpd(void *data)
"The filesystem seems to have been"
" multiply mounted.");
ext4_error(sb, "abort");
- goto failed;
+ put_bh(bh_check);
+ retval = -EBUSY;
+ goto exit_thread;
}
put_bh(bh_check);
}
@@ -248,7 +247,8 @@ static int kmmpd(void *data)
retval = write_mmp_block(sb, bh);
-failed:
+exit_thread:
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
kfree(data);
brelse(bh);
return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index fb6f11709ae6..325cef48b39a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -60,10 +60,10 @@ ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
- down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER);
} else {
down_write(&EXT4_I(second)->i_data_sem);
- down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
+ down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER);
}
}
@@ -156,7 +156,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
if (!page[1]) {
unlock_page(page[0]);
- page_cache_release(page[0]);
+ put_page(page[0]);
return -ENOMEM;
}
/*
@@ -192,7 +192,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits);
for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start = block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
@@ -265,11 +265,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
ext4_lblk_t orig_blk_offset, donor_blk_offset;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int tmp_data_size, data_size, replaced_size;
- int err2, jblocks, retries = 0;
+ int i, err2, jblocks, retries = 0;
int replaced_count = 0;
int from = data_offset_in_page << orig_inode->i_blkbits;
- int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
+ struct buffer_head *bh = NULL;
/*
* It needs twice the amount of ordinary journal buffers because
@@ -380,8 +381,17 @@ data_copy:
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
- *err = __block_write_begin(pagep[0], from, replaced_size,
- ext4_get_block);
+ if (!page_has_buffers(pagep[0]))
+ create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
+ bh = page_buffers(pagep[0]);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+ for (i = 0; i < block_len_in_page; i++) {
+ *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
+ if (*err < 0)
+ break;
+ bh = bh->b_this_page;
+ }
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
@@ -394,9 +404,9 @@ data_copy:
unlock_pages:
unlock_page(pagep[0]);
- page_cache_release(pagep[0]);
+ put_page(pagep[0]);
unlock_page(pagep[1]);
- page_cache_release(pagep[1]);
+ put_page(pagep[1]);
stop_journal:
ext4_journal_stop(handle);
if (*err == -ENOSPC &&
@@ -474,6 +484,13 @@ mext_check_arguments(struct inode *orig_inode,
return -EBUSY;
}
+ if (IS_NOQUOTA(orig_inode) || IS_NOQUOTA(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+ "not be quota files [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EBUSY;
+ }
+
/* Ext4 move extent supports only extent based file */
if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -544,7 +561,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
struct inode *orig_inode = file_inode(o_filp);
struct inode *donor_inode = file_inode(d_filp);
struct ext4_ext_path *path = NULL;
- int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
ext4_lblk_t o_end, o_start = orig_blk;
ext4_lblk_t d_start = donor_blk;
int ret;
@@ -638,9 +655,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
if (o_end - o_start < cur_len)
cur_len = o_end - o_start;
- orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+ orig_page_index = o_start >> (PAGE_SHIFT -
orig_inode->i_blkbits);
- donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+ donor_page_index = d_start >> (PAGE_SHIFT -
donor_inode->i_blkbits);
offset_in_page = o_start % blocks_per_page;
if (cur_len > blocks_per_page- offset_in_page)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a969ab39f302..48e4b8907826 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry, struct inode *inode);
+ struct inode *dir, struct inode *inode);
/* checksumming functions */
void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
@@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
struct ext4_dir_entry_2 *de;
struct buffer_head *bh;
+ if (ext4_encrypted_inode(dir)) {
+ int res = ext4_get_encryption_info(dir);
+
+ /*
+ * This should be a properly defined flag for
+ * dentry->d_flags when we uplift this to the VFS.
+ * d_fsdata is set to (void *) 1 if the dentry is
+ * created while the directory was encrypted and we
+ * don't have access to the key.
+ */
+ dentry->d_fsdata = NULL;
+ if (ext4_encryption_info(dir))
+ dentry->d_fsdata = (void *) 1;
+ d_set_d_op(dentry, &ext4_encrypted_d_ops);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
+
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-EFSCORRUPTED);
}
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
- (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!ext4_is_child_context_consistent_with_parent(dir,
inode)) {
+ int nokey = ext4_encrypted_inode(inode) &&
+ !ext4_encryption_info(inode);
+
iput(inode);
+ if (nokey)
+ return ERR_PTR(-ENOKEY);
ext4_warning(inode->i_sb,
"Inconsistent encryption contexts: %lu/%lu\n",
(unsigned long) dir->i_ino,
@@ -1928,10 +1950,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* directory, and adds the dentry to the indexed directory.
*/
static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry,
+ struct inode *dir,
struct inode *inode, struct buffer_head *bh)
{
- struct inode *dir = d_inode(dentry->d_parent);
struct buffer_head *bh2;
struct dx_root *root;
struct dx_frame frames[2], *frame;
@@ -2086,8 +2107,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
if (ext4_has_inline_data(dir)) {
- retval = ext4_try_add_inline_entry(handle, &fname,
- dentry, inode);
+ retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);
if (retval < 0)
goto out;
if (retval == 1) {
@@ -2097,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
if (is_dx(dir)) {
- retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
+ retval = ext4_dx_add_entry(handle, &fname, dir, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
@@ -2119,7 +2139,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (blocks == 1 && !dx_fallback &&
ext4_has_feature_dir_index(sb)) {
- retval = make_indexed_dir(handle, &fname, dentry,
+ retval = make_indexed_dir(handle, &fname, dir,
inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
goto out;
@@ -2154,12 +2174,11 @@ out:
* Returns 0 for success, or a negative error value
*/
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
- struct dentry *dentry, struct inode *inode)
+ struct inode *dir, struct inode *inode)
{
struct dx_frame frames[2], *frame;
struct dx_entry *entries, *at;
struct buffer_head *bh;
- struct inode *dir = d_inode(dentry->d_parent);
struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
int err;
@@ -2756,7 +2775,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
return 0;
WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !mutex_is_locked(&inode->i_mutex));
+ !inode_is_locked(inode));
/*
* Exit early if inode already is on orphan list. This is a big speedup
* since we don't have to contend on the global s_orphan_lock.
@@ -2838,7 +2857,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
return 0;
WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
- !mutex_is_locked(&inode->i_mutex));
+ !inode_is_locked(inode));
/* Do this quick check before taking global s_orphan_lock. */
if (list_empty(&ei->i_orphan))
return 0;
@@ -3132,6 +3151,7 @@ static int ext4_symlink(struct inode *dir,
if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
if (!encryption_required)
inode->i_op = &ext4_symlink_inode_operations;
+ inode_nohighmem(inode);
ext4_set_aops(inode);
/*
* We cannot call page_symlink() with transaction started
@@ -3211,6 +3231,12 @@ static int ext4_link(struct dentry *old_dentry,
if (ext4_encrypted_inode(dir) &&
!ext4_is_child_context_consistent_with_parent(dir, inode))
return -EPERM;
+
+ if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
+ (!projid_eq(EXT4_I(dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
err = dquot_initialize(dir);
if (err)
return err;
@@ -3491,6 +3517,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
int credits;
u8 old_file_type;
+ if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
+ (!projid_eq(EXT4_I(new_dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
retval = dquot_initialize(old.dir);
if (retval)
return retval;
@@ -3700,6 +3731,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new.inode)))
return -EPERM;
+ if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
+ !projid_eq(EXT4_I(new_dir)->i_projid,
+ EXT4_I(old_dentry->d_inode)->i_projid)) ||
+ (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) &&
+ !projid_eq(EXT4_I(old_dir)->i_projid,
+ EXT4_I(new_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
retval = dquot_initialize(old.dir);
if (retval)
return retval;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 17fbe3882b8e..e4fc8ea45d78 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/backing-dev.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -52,9 +53,8 @@ void ext4_exit_pageio(void)
*/
static void buffer_io_error(struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
- printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
- bdevname(bh->b_bdev, b),
+ printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
+ bh->b_bdev,
(unsigned long long)bh->b_blocknr);
}
@@ -129,9 +129,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
WARN_ON(io_end->handle);
- if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
- wake_up_all(ext4_ioend_wq(io_end->inode));
-
for (bio = io_end->bio; bio; bio = next_bio) {
next_bio = bio->bi_private;
ext4_finish_bio(bio);
@@ -140,16 +137,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
kmem_cache_free(io_end_cachep, io_end);
}
-static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
-{
- struct inode *inode = io_end->inode;
-
- io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
-}
-
/*
* Check a range of space and convert unwritten extents to written. Note that
* we are protected from truncate touching same part of extent tree by the
@@ -266,7 +253,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
{
ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
if (io) {
- atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
INIT_LIST_HEAD(&io->list);
atomic_set(&io->count, 1);
@@ -447,8 +433,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
@@ -485,9 +471,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
nr_to_submit) {
- data_page = ext4_encrypt(inode, page);
+ gfp_t gfp_flags = GFP_NOFS;
+
+ retry_encrypt:
+ data_page = ext4_encrypt(inode, page, gfp_flags);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
+ if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
+ if (io->io_bio) {
+ ext4_io_submit(io);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ }
+ gfp_flags |= __GFP_NOFAIL;
+ goto retry_encrypt;
+ }
data_page = NULL;
goto out;
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index d94af71a4e7f..dc54a4b60eba 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -23,7 +23,7 @@
*
* then this code just gives up and calls the buffer_head-based read function.
* It does handle a page which has holes at the end - that is a common case:
- * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ * the end-of-file on blocksize < PAGE_SIZE setups.
*
*/
@@ -140,7 +140,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
@@ -166,14 +166,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL & mapping_gfp_mask(mapping)))
+ mapping_gfp_constraint(mapping, GFP_KERNEL)))
goto next_page;
}
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
@@ -217,7 +217,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
set_error_page:
SetPageError(page);
zero_user_segment(page, 0,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
unlock_page(page);
goto next_page;
}
@@ -250,7 +250,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
}
if (first_hole != blocks_per_page) {
zero_user_segment(page, first_hole << blkbits,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -279,7 +279,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (ext4_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
- ctx = ext4_get_crypto_ctx(inode);
+ ctx = ext4_get_crypto_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
}
@@ -319,7 +319,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
unlock_page(page);
next_page:
if (pages)
- page_cache_release(page);
+ put_page(page);
}
BUG_ON(pages && !list_empty(pages));
if (bio)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ad62d7acc315..34038e3598d5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -198,7 +198,7 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
if (flex_gd == NULL)
goto out3;
- if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+ if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
goto out2;
flex_gd->count = flexbg_size;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04d0f1b33409..304c712dbe12 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,7 +55,6 @@
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
-static int ext4_mballoc_ready;
static struct ratelimit_state ext4_mount_msg_ratelimit;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -80,6 +79,36 @@ static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
+/*
+ * Lock ordering
+ *
+ * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
+ * i_mmap_rwsem (inode->i_mmap_rwsem)!
+ *
+ * page fault path:
+ * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
+ * page lock -> i_data_sem (rw)
+ *
+ * buffered write path:
+ * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> transaction start -> page lock ->
+ * i_data_sem (rw)
+ *
+ * truncate:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ * i_mmap_rwsem (w) -> page lock
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ * transaction start -> i_data_sem (rw)
+ *
+ * direct IO:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) ->
+ * transaction start -> i_data_sem (rw)
+ *
+ * writepages:
+ * transaction start -> page lock(s) -> i_data_sem (rw)
+ */
+
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
@@ -814,7 +843,6 @@ static void ext4_put_super(struct super_block *sb)
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
- ext4_xattr_put_super(sb);
if (!(sb->s_flags & MS_RDONLY)) {
ext4_clear_feature_journal_needs_recovery(sb);
@@ -914,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
- atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -958,6 +985,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
+ init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
}
@@ -966,7 +994,7 @@ static int __init init_inodecache(void)
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ext4_inode_cachep == NULL)
return -ENOMEM;
@@ -1061,13 +1089,13 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return 0;
if (journal)
return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_WAIT);
+ wait & ~__GFP_DIRECT_RECLAIM);
return try_to_free_buffers(page);
}
#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
-#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+static char *quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
static int ext4_write_dquot(struct dquot *dquot);
static int ext4_acquire_dquot(struct dquot *dquot);
@@ -1085,6 +1113,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags);
static int ext4_enable_quotas(struct super_block *sb);
+static int ext4_get_next_id(struct super_block *sb, struct kqid *qid);
static struct dquot **ext4_get_dquots(struct inode *inode)
{
@@ -1100,6 +1129,8 @@ static const struct dquot_operations ext4_quota_operations = {
.write_info = ext4_write_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+ .get_projid = ext4_get_projid,
+ .get_next_id = ext4_get_next_id,
};
static const struct quotactl_ops ext4_qctl_operations = {
@@ -1109,7 +1140,8 @@ static const struct quotactl_ops ext4_qctl_operations = {
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk
+ .set_dqblk = dquot_set_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
};
#endif
@@ -1292,9 +1324,9 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
return -1;
}
if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
- "when QUOTA feature is enabled");
- return -1;
+ ext4_msg(sb, KERN_INFO, "Journaled quota options "
+ "ignored when QUOTA feature is enabled");
+ return 1;
}
qname = match_strdup(args);
if (!qname) {
@@ -1393,9 +1425,9 @@ static const struct mount_opts {
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2 | MOPT_SET},
+ MOPT_NO_EXT2},
{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
- MOPT_NO_EXT2 | MOPT_CLEAR},
+ MOPT_NO_EXT2},
{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1657,18 +1689,26 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
}
if (ext4_has_feature_quota(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Cannot set journaled quota options "
+ ext4_msg(sb, KERN_INFO,
+ "Quota format mount options ignored "
"when QUOTA feature is enabled");
- return -1;
+ return 1;
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
-#ifndef CONFIG_FS_DAX
} else if (token == Opt_dax) {
+#ifdef CONFIG_FS_DAX
+ ext4_msg(sb, KERN_WARNING,
+ "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ sbi->s_mount_opt |= m->mount_opt;
+#else
ext4_msg(sb, KERN_INFO, "dax option not supported");
return -1;
#endif
+ } else if (token == Opt_data_err_abort) {
+ sbi->s_mount_opt |= m->mount_opt;
+ } else if (token == Opt_data_err_ignore) {
+ sbi->s_mount_opt &= ~m->mount_opt;
} else {
if (!args->from)
arg = 1;
@@ -1717,11 +1757,11 @@ static int parse_options(char *options, struct super_block *sb,
#ifdef CONFIG_QUOTA
if (ext4_has_feature_quota(sb) &&
(test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
- ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
- "feature is enabled");
- return 0;
- }
- if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ ext4_msg(sb, KERN_INFO, "Quota feature enabled, usrquota and grpquota "
+ "mount options ignored.");
+ clear_opt(sb, USRQUOTA);
+ clear_opt(sb, GRPQUOTA);
+ } else if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
clear_opt(sb, USRQUOTA);
@@ -1745,7 +1785,7 @@ static int parse_options(char *options, struct super_block *sb,
int blocksize =
BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
- if (blocksize < PAGE_CACHE_SIZE) {
+ if (blocksize < PAGE_SIZE) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"dioread_nolock if block size != PAGE_SIZE");
return 0;
@@ -1878,6 +1918,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
if (nodefs || sbi->s_max_dir_size_kb)
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
+ if (test_opt(sb, DATA_ERR_ABORT))
+ SEQ_OPTS_PUTS("data_err=abort");
ext4_show_quota_options(seq, sb);
return 0;
@@ -2250,10 +2292,10 @@ static void ext4_orphan_cleanup(struct super_block *sb,
__func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
nr_truncates++;
} else {
if (test_opt(sb, DEBUG))
@@ -2522,6 +2564,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
"without CONFIG_QUOTA");
return 0;
}
+ if (ext4_has_feature_project(sb) && !readonly) {
+ ext4_msg(sb, KERN_ERR,
+ "Filesystem with project quota feature cannot be mounted RDWR "
+ "without CONFIG_QUOTA");
+ return 0;
+ }
#endif /* CONFIG_QUOTA */
return 1;
}
@@ -3650,7 +3698,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
- sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
@@ -3754,16 +3802,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
no_journal:
- if (ext4_mballoc_ready) {
- sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
- if (!sbi->s_mb_cache) {
- ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
- goto failed_mount_wq;
- }
+ sbi->s_mb_cache = ext4_xattr_create_cache();
+ if (!sbi->s_mb_cache) {
+ ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount_wq;
}
if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
- (blocksize != PAGE_CACHE_SIZE)) {
+ (blocksize != PAGE_SIZE)) {
ext4_msg(sb, KERN_ERR,
"Unsupported blocksize for fs encryption");
goto failed_mount_wq;
@@ -3985,6 +4031,10 @@ failed_mount4:
if (EXT4_SB(sb)->rsv_conversion_wq)
destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
failed_mount_wq:
+ if (sbi->s_mb_cache) {
+ ext4_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -4786,6 +4836,48 @@ restore_opts:
return err;
}
+#ifdef CONFIG_QUOTA
+static int ext4_statfs_project(struct super_block *sb,
+ kprojid_t projid, struct kstatfs *buf)
+{
+ struct kqid qid;
+ struct dquot *dquot;
+ u64 limit;
+ u64 curblock;
+
+ qid = make_kqid_projid(projid);
+ dquot = dqget(sb, qid);
+ if (IS_ERR(dquot))
+ return PTR_ERR(dquot);
+ spin_lock(&dq_data_lock);
+
+ limit = (dquot->dq_dqb.dqb_bsoftlimit ?
+ dquot->dq_dqb.dqb_bsoftlimit :
+ dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ if (limit && buf->f_blocks > limit) {
+ curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+ buf->f_blocks = limit;
+ buf->f_bfree = buf->f_bavail =
+ (buf->f_blocks > curblock) ?
+ (buf->f_blocks - curblock) : 0;
+ }
+
+ limit = dquot->dq_dqb.dqb_isoftlimit ?
+ dquot->dq_dqb.dqb_isoftlimit :
+ dquot->dq_dqb.dqb_ihardlimit;
+ if (limit && buf->f_files > limit) {
+ buf->f_files = limit;
+ buf->f_ffree =
+ (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ }
+
+ spin_unlock(&dq_data_lock);
+ dqput(dquot);
+ return 0;
+}
+#endif
+
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -4818,6 +4910,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+#ifdef CONFIG_QUOTA
+ if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
+ sb_has_quota_limits_enabled(sb, PRJQUOTA))
+ ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
+#endif
return 0;
}
@@ -4932,6 +5029,20 @@ static int ext4_quota_on_mount(struct super_block *sb, int type)
EXT4_SB(sb)->s_jquota_fmt, type);
}
+static void lockdep_set_quota_inode(struct inode *inode, int subclass)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ /* The first argument of lockdep_set_subclass has to be
+ * *exactly* the same as the argument to init_rwsem() --- in
+ * this case, in init_once() --- or lockdep gets unhappy
+ * because the name of the lock is set using the
+ * stringification of the argument to init_rwsem().
+ */
+ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */
+ lockdep_set_subclass(&ei->i_data_sem, subclass);
+}
+
/*
* Standard function to be called on quota_on
*/
@@ -4971,8 +5082,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (err)
return err;
}
-
- return dquot_quota_on(sb, type, format_id, path);
+ lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
+ err = dquot_quota_on(sb, type, format_id, path);
+ if (err)
+ lockdep_set_quota_inode(path->dentry->d_inode,
+ I_DATA_SEM_NORMAL);
+ return err;
}
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
@@ -4982,7 +5097,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
struct inode *qf_inode;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
- le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
BUG_ON(!ext4_has_feature_quota(sb));
@@ -4998,8 +5114,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
+ if (err)
+ lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
return err;
}
@@ -5010,7 +5129,8 @@ static int ext4_enable_quotas(struct super_block *sb)
int type, err = 0;
unsigned long qf_inums[EXT4_MAXQUOTAS] = {
le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
- le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
};
sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
@@ -5155,6 +5275,17 @@ out:
return len;
}
+static int ext4_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ const struct quota_format_ops *ops;
+
+ if (!sb_has_quota_loaded(sb, qid->type))
+ return -ESRCH;
+ ops = sb_dqopt(sb)->ops[qid->type];
+ if (!ops || !ops->get_next_id)
+ return -ENOSYS;
+ return dquot_get_next_id(sb, qid);
+}
#endif
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
@@ -5230,7 +5361,6 @@ MODULE_ALIAS_FS("ext4");
/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
static int __init ext4_init_fs(void)
{
@@ -5243,10 +5373,8 @@ static int __init ext4_init_fs(void)
/* Build-time check for flags consistency */
ext4_check_flag_values();
- for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
- mutex_init(&ext4__aio_mutex[i]);
+ for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
init_waitqueue_head(&ext4__ioend_wq[i]);
- }
err = ext4_init_es();
if (err)
@@ -5267,8 +5395,6 @@ static int __init ext4_init_fs(void)
err = ext4_init_mballoc();
if (err)
goto out2;
- else
- ext4_mballoc_ready = 1;
err = init_inodecache();
if (err)
goto out1;
@@ -5284,7 +5410,6 @@ out:
unregister_as_ext3();
destroy_inodecache();
out1:
- ext4_mballoc_ready = 0;
ext4_exit_mballoc();
out2:
ext4_exit_sysfs();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index abe2401ce405..75ed5c2f0c16 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,17 +23,21 @@
#include "xattr.h"
#ifdef CONFIG_EXT4_FS_ENCRYPTION
-static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *ext4_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
struct ext4_str cstr, pstr;
- struct inode *inode = d_inode(dentry);
struct ext4_encrypted_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
int res;
u32 plen, max_size = inode->i_sb->s_blocksize;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
res = ext4_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
@@ -45,14 +49,14 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
}
/* Symlink is encrypted */
sd = (struct ext4_encrypted_symlink_data *)caddr;
cstr.name = sd->encrypted_path;
- cstr.len = le32_to_cpu(sd->len);
+ cstr.len = le16_to_cpu(sd->len);
if ((cstr.len +
sizeof(struct ext4_encrypted_symlink_data) - 1) >
max_size) {
@@ -75,24 +79,20 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
/* Null-terminate the name */
if (res <= plen)
paddr[res] = '\0';
- if (cpage) {
- kunmap(cpage);
- page_cache_release(cpage);
- }
- return *cookie = paddr;
+ if (cpage)
+ put_page(cpage);
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
- if (cpage) {
- kunmap(cpage);
- page_cache_release(cpage);
- }
+ if (cpage)
+ put_page(cpage);
kfree(paddr);
return ERR_PTR(res);
}
const struct inode_operations ext4_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = ext4_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = ext4_encrypted_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -103,8 +103,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -114,7 +113,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 1b57c72f4a00..1420a3c614af 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -358,7 +358,7 @@ static int name##_open(struct inode *inode, struct file *file) \
return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \
} \
\
-const struct file_operations ext4_seq_##name##_fops = { \
+static const struct file_operations ext4_seq_##name##_fops = { \
.owner = THIS_MODULE, \
.open = name##_open, \
.read = seq_read, \
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 011ba6670d99..c70d06a383e2 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
+ down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
+ up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 984448c6f5f0..e79bd32b9b79 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -68,10 +68,8 @@
printk("\n"); \
} while (0)
# define ea_bdebug(bh, f...) do { \
- char b[BDEVNAME_SIZE]; \
- printk(KERN_DEBUG "block %s:%lu: ", \
- bdevname(bh->b_bdev, b), \
- (unsigned long) bh->b_blocknr); \
+ printk(KERN_DEBUG "block %pg:%lu: ", \
+ bh->b_bdev, (unsigned long) bh->b_blocknr); \
printk(f); \
printk("\n"); \
} while (0)
@@ -232,6 +230,27 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
return error;
}
+static int
+__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
+ void *end, const char *function, unsigned int line)
+{
+ struct ext4_xattr_entry *entry = IFIRST(header);
+ int error = -EFSCORRUPTED;
+
+ if (((void *) header >= end) ||
+ (header->h_magic != le32_to_cpu(EXT4_XATTR_MAGIC)))
+ goto errout;
+ error = ext4_xattr_check_names(entry, end, entry);
+errout:
+ if (error)
+ __ext4_error_inode(inode, function, line, 0,
+ "corrupted in-inode xattr");
+ return error;
+}
+
+#define xattr_check_inode(inode, header, end) \
+ __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
+
static inline int
ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
{
@@ -343,7 +362,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(entry, end, entry);
+ error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
error = ext4_xattr_find_entry(&entry, name_index, name,
@@ -404,20 +423,24 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
const struct xattr_handler *handler =
ext4_xattr_handler(entry->e_name_index);
- if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ if (handler && (!handler->list || handler->list(dentry))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_len = strlen(prefix);
+ size_t size = prefix_len + entry->e_name_len + 1;
+
if (buffer) {
if (size > rest)
return -ERANGE;
- buffer += size;
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
rest -= size;
}
}
- return buffer_size - rest;
+ return buffer_size - rest; /* total size */
}
static int
@@ -475,7 +498,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
+ error = xattr_check_inode(inode, header, end);
if (error)
goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
@@ -543,30 +566,44 @@ static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
- struct mb_cache_entry *ce = NULL;
- int error = 0;
struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
+ u32 hash, ref;
+ int error = 0;
- ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
BUFFER_TRACE(bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bh);
if (error)
goto out;
lock_buffer(bh);
- if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+ hash = le32_to_cpu(BHDR(bh)->h_hash);
+ ref = le32_to_cpu(BHDR(bh)->h_refcount);
+ if (ref == 1) {
ea_bdebug(bh, "refcount now=0; freeing");
- if (ce)
- mb_cache_entry_free(ce);
+ /*
+ * This must happen under buffer lock for
+ * ext4_xattr_block_set() to reliably detect freed block
+ */
+ mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
get_bh(bh);
unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
} else {
- le32_add_cpu(&BHDR(bh)->h_refcount, -1);
- if (ce)
- mb_cache_entry_release(ce);
+ ref--;
+ BHDR(bh)->h_refcount = cpu_to_le32(ref);
+ if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
+ struct mb_cache_entry *ce;
+
+ ce = mb_cache_entry_get(ext4_mb_cache, hash,
+ bh->b_blocknr);
+ if (ce) {
+ ce->e_reusable = 1;
+ mb_cache_entry_put(ext4_mb_cache, ce);
+ }
+ }
+
/*
* Beware of this ugliness: Releasing of xattr block references
* from different inodes can race and so we have to protect
@@ -788,8 +825,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (i->value && i->value_len > sb->s_blocksize)
return -ENOSPC;
if (s->base) {
- ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
- bs->bh->b_blocknr);
BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
@@ -797,10 +832,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
lock_buffer(bs->bh);
if (header(s->base)->h_refcount == cpu_to_le32(1)) {
- if (ce) {
- mb_cache_entry_free(ce);
- ce = NULL;
- }
+ __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);
+
+ /*
+ * This must happen under buffer lock for
+ * ext4_xattr_block_set() to reliably detect modified
+ * block
+ */
+ mb_cache_entry_delete_block(ext4_mb_cache, hash,
+ bs->bh->b_blocknr);
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s);
if (!error) {
@@ -824,10 +864,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
int offset = (char *)s->here - bs->bh->b_data;
unlock_buffer(bs->bh);
- if (ce) {
- mb_cache_entry_release(ce);
- ce = NULL;
- }
ea_bdebug(bs->bh, "cloning");
s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
error = -ENOMEM;
@@ -870,6 +906,8 @@ inserted:
if (new_bh == bs->bh)
ea_bdebug(new_bh, "keeping");
else {
+ u32 ref;
+
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -882,9 +920,40 @@ inserted:
if (error)
goto cleanup_dquot;
lock_buffer(new_bh);
- le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
+ /*
+ * We have to be careful about races with
+ * freeing, rehashing or adding references to
+ * xattr block. Once we hold buffer lock xattr
+ * block's state is stable so we can check
+ * whether the block got freed / rehashed or
+ * not. Since we unhash mbcache entry under
+ * buffer lock when freeing / rehashing xattr
+ * block, checking whether entry is still
+ * hashed is reliable. Same rules hold for
+ * e_reusable handling.
+ */
+ if (hlist_bl_unhashed(&ce->e_hash_list) ||
+ !ce->e_reusable) {
+ /*
+ * Undo everything and check mbcache
+ * again.
+ */
+ unlock_buffer(new_bh);
+ dquot_free_block(inode,
+ EXT4_C2B(EXT4_SB(sb),
+ 1));
+ brelse(new_bh);
+ mb_cache_entry_put(ext4_mb_cache, ce);
+ ce = NULL;
+ new_bh = NULL;
+ goto inserted;
+ }
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
+ if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+ ce->e_reusable = 0;
ea_bdebug(new_bh, "reusing; refcount now=%d",
- le32_to_cpu(BHDR(new_bh)->h_refcount));
+ ref);
unlock_buffer(new_bh);
error = ext4_handle_dirty_xattr_block(handle,
inode,
@@ -892,7 +961,8 @@ inserted:
if (error)
goto cleanup_dquot;
}
- mb_cache_entry_release(ce);
+ mb_cache_entry_touch(ext4_mb_cache, ce);
+ mb_cache_entry_put(ext4_mb_cache, ce);
ce = NULL;
} else if (bs->bh && s->base == bs->bh->b_data) {
/* We were modifying this block in-place. */
@@ -957,7 +1027,7 @@ getblk_failed:
cleanup:
if (ce)
- mb_cache_entry_release(ce);
+ mb_cache_entry_put(ext4_mb_cache, ce);
brelse(new_bh);
if (!(bs->bh && s->base == bs->bh->b_data))
kfree(s->base);
@@ -991,8 +1061,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = ext4_xattr_check_names(IFIRST(header), is->s.end,
- IFIRST(header));
+ error = xattr_check_inode(inode, header, is->s.end);
if (error)
return error;
/* Find the named attribute. */
@@ -1068,6 +1137,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
return 0;
}
+static int ext4_xattr_value_same(struct ext4_xattr_search *s,
+ struct ext4_xattr_info *i)
+{
+ void *value;
+
+ if (le32_to_cpu(s->here->e_value_size) != i->value_len)
+ return 0;
+ value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
+ return !memcmp(value, i->value, i->value_len);
+}
+
/*
* ext4_xattr_set_handle()
*
@@ -1144,6 +1224,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
else if (!bs.s.not_found)
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else {
+ error = 0;
+ /* Xattr value did not change? Save us some work and bail out */
+ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
+ goto cleanup;
+ if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
+ goto cleanup;
+
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (!error && !bs.s.not_found) {
i.value = NULL;
@@ -1289,6 +1376,10 @@ retry:
last = entry;
total_ino = sizeof(struct ext4_xattr_ibody_header);
+ error = xattr_check_inode(inode, header, end);
+ if (error)
+ goto cleanup;
+
free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
if (free >= new_extra_isize) {
entry = IFIRST(header);
@@ -1510,17 +1601,6 @@ cleanup:
}
/*
- * ext4_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext4_xattr_put_super(struct super_block *sb)
-{
- mb_cache_shrink(sb->s_bdev);
-}
-
-/*
* ext4_xattr_cache_insert()
*
* Create a new entry in the extended attribute cache, and insert
@@ -1531,26 +1611,19 @@ ext4_xattr_put_super(struct super_block *sb)
static void
ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{
- __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
- struct mb_cache_entry *ce;
+ struct ext4_xattr_header *header = BHDR(bh);
+ __u32 hash = le32_to_cpu(header->h_hash);
+ int reusable = le32_to_cpu(header->h_refcount) <
+ EXT4_XATTR_REFCOUNT_MAX;
int error;
- ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
- if (!ce) {
- ea_bdebug(bh, "out of memory");
- return;
- }
- error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+ error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
+ bh->b_blocknr, reusable);
if (error) {
- mb_cache_entry_free(ce);
- if (error == -EBUSY) {
+ if (error == -EBUSY)
ea_bdebug(bh, "already in cache");
- error = 0;
- }
- } else {
+ } else
ea_bdebug(bh, "inserting [%x]", (int)hash);
- mb_cache_entry_release(ce);
- }
}
/*
@@ -1612,33 +1685,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
- ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
- hash);
+ ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
while (ce) {
struct buffer_head *bh;
- if (IS_ERR(ce)) {
- if (PTR_ERR(ce) == -EAGAIN)
- goto again;
- break;
- }
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
EXT4_ERROR_INODE(inode, "block %lu read error",
(unsigned long) ce->e_block);
- } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
- EXT4_XATTR_REFCOUNT_MAX) {
- ea_idebug(inode, "block %lu refcount %d>=%d",
- (unsigned long) ce->e_block,
- le32_to_cpu(BHDR(bh)->h_refcount),
- EXT4_XATTR_REFCOUNT_MAX);
} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
*pce = ce;
return bh;
}
brelse(bh);
- ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+ ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
}
return NULL;
}
@@ -1714,9 +1774,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
#define HASH_BUCKET_BITS 10
struct mb_cache *
-ext4_xattr_create_cache(char *name)
+ext4_xattr_create_cache(void)
{
- return mb_cache_create(name, HASH_BUCKET_BITS);
+ return mb_cache_create(HASH_BUCKET_BITS);
}
void ext4_xattr_destroy_cache(struct mb_cache *cache)
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ddc0957760ba..69dd3e6566e0 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_
extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
-extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
-extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
#ifdef CONFIG_EXT4_FS_SECURITY
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 95d90e0560f0..3e81bdca071a 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -11,38 +11,20 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
- const size_t total_len = prefix_len + name_len + 1;
-
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
static int
-ext4_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, buffer, size);
}
static int
-ext4_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -76,7 +58,6 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ext4_xattr_security_list,
.get = ext4_xattr_security_get,
.set = ext4_xattr_security_set,
};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 891ee2ddfbd6..2a3c6f9b8cb8 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -12,40 +12,26 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext4_xattr_trusted_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return capable(CAP_SYS_ADMIN);
}
static int
-ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
+ext4_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
static int
-ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 6ed932b3c043..d152f431e432 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -11,30 +11,17 @@
#include "ext4.h"
#include "xattr.h"
-static size_t
-ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+ext4_xattr_user_list(struct dentry *dentry)
{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!test_opt(dentry->d_sb, XATTR_USER))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list+prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
+ return test_opt(dentry->d_sb, XATTR_USER);
}
static int
-ext4_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
@@ -42,11 +29,10 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext4_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index b0a9dc929f88..1f8982a957f1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,6 +1,8 @@
config F2FS_FS
tristate "F2FS filesystem support"
depends on BLOCK
+ select CRYPTO
+ select CRYPTO_CRC32
help
F2FS is based on Log-structured File System (LFS), which supports
versatile "flash-friendly" features. The design has been focused on
@@ -76,15 +78,7 @@ config F2FS_FS_ENCRYPTION
bool "F2FS Encryption"
depends on F2FS_FS
depends on F2FS_FS_XATTR
- select CRYPTO_AES
- select CRYPTO_CBC
- select CRYPTO_ECB
- select CRYPTO_XTS
- select CRYPTO_CTS
- select CRYPTO_CTR
- select CRYPTO_SHA256
- select KEYS
- select ENCRYPTED_KEYS
+ select FS_ENCRYPTION
help
Enable encryption of f2fs files and directories. This
feature is similar to ecryptfs, but it is more memory
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 08e101ed914c..ca949ea7c02f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
-f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \
- crypto_key.o crypto_fname.o
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f661d80474be..0955312e5ca0 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -39,7 +39,7 @@ repeat:
cond_resched();
goto repeat;
}
- f2fs_wait_on_page_writeback(page, META);
+ f2fs_wait_on_page_writeback(page, META, true);
SetPageUptodate(page);
return page;
}
@@ -56,7 +56,8 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
.sbi = sbi,
.type = META,
.rw = READ_SYNC | REQ_META | REQ_PRIO,
- .blk_addr = index,
+ .old_blkaddr = index,
+ .new_blkaddr = index,
.encrypted_page = NULL,
};
@@ -143,7 +144,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync)
{
- block_t prev_blk_addr = 0;
struct page *page;
block_t blkno = start;
struct f2fs_io_info fio = {
@@ -152,10 +152,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
.rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
.encrypted_page = NULL,
};
+ struct blk_plug plug;
if (unlikely(type == META_POR))
fio.rw &= ~REQ_META;
+ blk_start_plug(&plug);
for (; nrpages-- > 0; blkno++) {
if (!is_valid_blkaddr(sbi, blkno, type))
@@ -167,27 +169,24 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
blkno = 0;
/* get nat block addr */
- fio.blk_addr = current_nat_addr(sbi,
+ fio.new_blkaddr = current_nat_addr(sbi,
blkno * NAT_ENTRY_PER_BLOCK);
break;
case META_SIT:
/* get sit block addr */
- fio.blk_addr = current_sit_addr(sbi,
+ fio.new_blkaddr = current_sit_addr(sbi,
blkno * SIT_ENTRY_PER_BLOCK);
- if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
- goto out;
- prev_blk_addr = fio.blk_addr;
break;
case META_SSA:
case META_CP:
case META_POR:
- fio.blk_addr = blkno;
+ fio.new_blkaddr = blkno;
break;
default:
BUG();
}
- page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
+ page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr);
if (!page)
continue;
if (PageUptodate(page)) {
@@ -196,11 +195,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
}
fio.page = page;
+ fio.old_blkaddr = fio.new_blkaddr;
f2fs_submit_page_mbio(&fio);
f2fs_put_page(page, 0);
}
out:
f2fs_submit_merged_bio(sbi, META, READ);
+ blk_finish_plug(&plug);
return blkno - start;
}
@@ -232,13 +233,17 @@ static int f2fs_write_meta_page(struct page *page,
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
- f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
dec_page_count(sbi, F2FS_DIRTY_META);
- unlock_page(page);
if (wbc->for_reclaim)
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE);
+
+ unlock_page(page);
+
+ if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, META, WRITE);
+
return 0;
redirty_out:
@@ -252,13 +257,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff, written;
- trace_f2fs_writepages(mapping->host, wbc, META);
-
/* collect a number of dirty meta pages and write together */
if (wbc->for_kupdate ||
get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
goto skip_write;
+ trace_f2fs_writepages(mapping->host, wbc, META);
+
/* if mounting is failed, skip writing node pages */
mutex_lock(&sbi->cp_mutex);
diff = nr_pages_to_write(sbi, META, wbc);
@@ -269,6 +274,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
skip_write:
wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
+ trace_f2fs_writepages(mapping->host, wbc, META);
return 0;
}
@@ -276,15 +282,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write)
{
struct address_space *mapping = META_MAPPING(sbi);
- pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX;
+ pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
struct pagevec pvec;
long nwritten = 0;
struct writeback_control wbc = {
.for_reclaim = 0,
};
+ struct blk_plug plug;
pagevec_init(&pvec, 0);
+ blk_start_plug(&plug);
+
while (index <= end) {
int i, nr_pages;
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
@@ -296,7 +305,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (prev == LONG_MAX)
+ if (prev == ULONG_MAX)
prev = page->index - 1;
if (nr_to_write != LONG_MAX && page->index != prev + 1) {
pagevec_release(&pvec);
@@ -315,6 +324,9 @@ continue_unlock:
goto continue_unlock;
}
+ f2fs_wait_on_page_writeback(page, META, true);
+
+ BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
@@ -334,6 +346,8 @@ stop:
if (nwritten)
f2fs_submit_merged_bio(sbi, type, WRITE);
+ blk_finish_plug(&plug);
+
return nwritten;
}
@@ -410,13 +424,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
spin_unlock(&im->ino_lock);
}
-void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* add new dirty ino entry into list */
__add_ino_entry(sbi, ino, type);
}
-void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
{
/* remove dirty ino entry from list */
__remove_ino_entry(sbi, ino, type);
@@ -434,7 +448,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
return e ? true : false;
}
-void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_ino_entry(struct f2fs_sb_info *sbi)
{
struct ino_entry *e, *tmp;
int i;
@@ -621,7 +635,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
goto invalid_cp1;
crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
goto invalid_cp1;
pre_version = cur_cp_version(cp_block);
@@ -636,7 +650,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
goto invalid_cp2;
crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
- if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ if (!f2fs_crc_valid(sbi, crc, cp_block, crc_offset))
goto invalid_cp2;
cur_version = cur_cp_version(cp_block);
@@ -696,6 +710,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
memcpy(sbi->ckpt, cp_block, blk_size);
+ /* Sanity checking of checkpoint */
+ if (sanity_check_ckpt(sbi))
+ goto fail_no_cp;
+
if (cp_blks <= 1)
goto done;
@@ -722,47 +740,48 @@ fail_no_cp:
return -EINVAL;
}
-static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
+static void __add_dirty_inode(struct inode *inode, enum inode_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
- if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
- return -EEXIST;
+ if (is_inode_flag_set(fi, flag))
+ return;
- set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- F2FS_I(inode)->dirty_dir = new;
- list_add_tail(&new->list, &sbi->dir_inode_list);
- stat_inc_dirty_dir(sbi);
- return 0;
+ set_inode_flag(fi, flag);
+ list_add_tail(&fi->dirty_list, &sbi->inode_list[type]);
+ stat_inc_dirty_inode(sbi, type);
+}
+
+static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
+
+ if (get_dirty_pages(inode) ||
+ !is_inode_flag_set(F2FS_I(inode), flag))
+ return;
+
+ list_del_init(&fi->dirty_list);
+ clear_inode_flag(fi, flag);
+ stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
void update_dirty_page(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *new;
- int ret = 0;
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
!S_ISLNK(inode->i_mode))
return;
- if (!S_ISDIR(inode->i_mode)) {
- inode_inc_dirty_pages(inode);
- goto out;
- }
-
- new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
-
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
+ spin_lock(&sbi->inode_lock[type]);
+ __add_dirty_inode(inode, type);
inode_inc_dirty_pages(inode);
- spin_unlock(&sbi->dir_inode_lock);
+ spin_unlock(&sbi->inode_lock[type]);
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
-out:
SetPagePrivate(page);
f2fs_trace_pid(page);
}
@@ -770,70 +789,60 @@ out:
void add_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *new =
- f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- int ret = 0;
- new->inode = inode;
- INIT_LIST_HEAD(&new->list);
-
- spin_lock(&sbi->dir_inode_lock);
- ret = __add_dirty_inode(inode, new);
- spin_unlock(&sbi->dir_inode_lock);
-
- if (ret)
- kmem_cache_free(inode_entry_slab, new);
+ spin_lock(&sbi->inode_lock[DIR_INODE]);
+ __add_dirty_inode(inode, DIR_INODE);
+ spin_unlock(&sbi->inode_lock[DIR_INODE]);
}
-void remove_dirty_dir_inode(struct inode *inode)
+void remove_dirty_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct inode_entry *entry;
-
- if (!S_ISDIR(inode->i_mode))
- return;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
- spin_lock(&sbi->dir_inode_lock);
- if (get_dirty_pages(inode) ||
- !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
- spin_unlock(&sbi->dir_inode_lock);
+ if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
return;
- }
- entry = F2FS_I(inode)->dirty_dir;
- list_del(&entry->list);
- F2FS_I(inode)->dirty_dir = NULL;
- clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
- stat_dec_dirty_dir(sbi);
- spin_unlock(&sbi->dir_inode_lock);
- kmem_cache_free(inode_entry_slab, entry);
+ spin_lock(&sbi->inode_lock[type]);
+ __remove_dirty_inode(inode, type);
+ spin_unlock(&sbi->inode_lock[type]);
/* Only from the recovery routine */
- if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
- clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+ if (is_inode_flag_set(fi, FI_DELAY_IPUT)) {
+ clear_inode_flag(fi, FI_DELAY_IPUT);
iput(inode);
}
}
-void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
{
struct list_head *head;
- struct inode_entry *entry;
struct inode *inode;
+ struct f2fs_inode_info *fi;
+ bool is_dir = (type == DIR_INODE);
+
+ trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
retry:
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
- spin_lock(&sbi->dir_inode_lock);
+ spin_lock(&sbi->inode_lock[type]);
- head = &sbi->dir_inode_list;
+ head = &sbi->inode_list[type];
if (list_empty(head)) {
- spin_unlock(&sbi->dir_inode_lock);
- return;
+ spin_unlock(&sbi->inode_lock[type]);
+ trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
+ get_pages(sbi, is_dir ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
+ return 0;
}
- entry = list_entry(head->next, struct inode_entry, list);
- inode = igrab(entry->inode);
- spin_unlock(&sbi->dir_inode_lock);
+ fi = list_entry(head->next, struct f2fs_inode_info, dirty_list);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[type]);
if (inode) {
filemap_fdatawrite(inode->i_mapping);
iput(inode);
@@ -868,11 +877,9 @@ retry_flush_dents:
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- sync_dirty_dir_inodes(sbi);
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
+ err = sync_dirty_inodes(sbi, DIR_INODE);
+ if (err)
goto out;
- }
goto retry_flush_dents;
}
@@ -885,10 +892,9 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- sync_node_pages(sbi, 0, &wbc);
- if (unlikely(f2fs_cp_error(sbi))) {
+ err = sync_node_pages(sbi, 0, &wbc);
+ if (err) {
f2fs_unlock_all(sbi);
- err = -EIO;
goto out;
}
goto retry_flush_nodes;
@@ -914,12 +920,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
if (!get_pages(sbi, F2FS_WRITEBACK))
break;
- io_schedule();
+ io_schedule_timeout(5*HZ);
}
finish_wait(&sbi->cp_wait, &wait);
}
-static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -933,6 +939,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
int cp_payload_blks = __cp_payload(sbi);
block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg);
bool invalidate = false;
+ struct super_block *sb = sbi->sb;
+ struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
+ u64 kbytes_written;
/*
* This avoids to conduct wrong roll-forward operations and uses
@@ -945,7 +954,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
while (get_pages(sbi, F2FS_DIRTY_META)) {
sync_meta_pages(sbi, META, LONG_MAX);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
}
next_free_nid(sbi, &last_nid);
@@ -1020,7 +1029,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
- crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+ crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset));
*((__le32 *)((unsigned char *)ckpt +
le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
@@ -1030,7 +1039,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* need to wait for end_io results */
wait_on_all_pages_writeback(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
/* write out checkpoint buffer at block 0 */
update_meta_page(sbi, ckpt, start_blk++);
@@ -1046,6 +1055,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
+
+ /* Record write statistics in the hot node summary */
+ kbytes_written = sbi->kbytes_written;
+ if (sb->s_bdev->bd_part)
+ kbytes_written += BD_PART_WRITTEN(sbi);
+
+ seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written);
+
if (__remain_node_summaries(cpc->reason)) {
write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
@@ -1058,10 +1075,10 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
wait_on_all_pages_writeback(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
- filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
- filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
+ filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX);
+ filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX);
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -1081,22 +1098,25 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
discard_blk);
- release_dirty_inode(sbi);
+ release_ino_entry(sbi);
if (unlikely(f2fs_cp_error(sbi)))
- return;
+ return -EIO;
clear_prefree_segments(sbi, cpc);
clear_sbi_flag(sbi, SBI_IS_DIRTY);
+
+ return 0;
}
/*
* We guarantee that this checkpoint procedure will not fail.
*/
-void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long long ckpt_ver;
+ int err = 0;
mutex_lock(&sbi->cp_mutex);
@@ -1104,21 +1124,24 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
(cpc->reason == CP_DISCARD && !sbi->discard_blks)))
goto out;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
goto out;
- if (f2fs_readonly(sbi->sb))
+ }
+ if (f2fs_readonly(sbi->sb)) {
+ err = -EROFS;
goto out;
+ }
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
- if (block_operations(sbi))
+ err = block_operations(sbi);
+ if (err)
goto out;
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
- f2fs_submit_merged_bio(sbi, META, WRITE);
+ f2fs_flush_merged_bios(sbi);
/*
* update checkpoint pack index
@@ -1133,7 +1156,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
flush_sit_entries(sbi, cpc);
/* unlock all the fs_lock[] in do_checkpoint() */
- do_checkpoint(sbi, cpc);
+ err = do_checkpoint(sbi, cpc);
unblock_operations(sbi);
stat_inc_cp_count(sbi->stat_info);
@@ -1143,10 +1166,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
"checkpoint: version = %llx", ckpt_ver);
/* do checkpoint periodically */
- sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval);
+ f2fs_update_time(sbi, CP_TIME);
+ trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
mutex_unlock(&sbi->cp_mutex);
- trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+ return err;
}
void init_ino_entry_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
deleted file mode 100644
index 4a62ef14e932..000000000000
--- a/fs/f2fs/crypto.c
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * linux/fs/f2fs/crypto.c
- *
- * Copied from linux/fs/ext4/crypto.c
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility
- *
- * This contains encryption functions for f2fs
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- * Uday Savagaonkar, 2014
- * Encryption policy handling additions
- * Ildar Muslukhov, 2014
- * Remove ext4_encrypted_zeroout(),
- * add f2fs_restore_and_release_control_page()
- * Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
- */
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-#include <keys/user-type.h>
-#include <keys/encrypted-type.h>
-#include <linux/crypto.h>
-#include <linux/ecryptfs.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-#include <linux/f2fs_fs.h>
-#include <linux/ratelimit.h>
-#include <linux/bio.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-/* Encryption added and removed here! (L: */
-
-static unsigned int num_prealloc_crypto_pages = 32;
-static unsigned int num_prealloc_crypto_ctxs = 128;
-
-module_param(num_prealloc_crypto_pages, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_pages,
- "Number of crypto pages to preallocate");
-module_param(num_prealloc_crypto_ctxs, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
- "Number of crypto contexts to preallocate");
-
-static mempool_t *f2fs_bounce_page_pool;
-
-static LIST_HEAD(f2fs_free_crypto_ctxs);
-static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock);
-
-static struct workqueue_struct *f2fs_read_workqueue;
-static DEFINE_MUTEX(crypto_init);
-
-static struct kmem_cache *f2fs_crypto_ctx_cachep;
-struct kmem_cache *f2fs_crypt_info_cachep;
-
-/**
- * f2fs_release_crypto_ctx() - Releases an encryption context
- * @ctx: The encryption context to release.
- *
- * If the encryption context was allocated from the pre-allocated pool, returns
- * it to that pool. Else, frees it.
- *
- * If there's a bounce page in the context, this frees that.
- */
-void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
-{
- unsigned long flags;
-
- if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
- mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
- ctx->w.bounce_page = NULL;
- }
- ctx->w.control_page = NULL;
- if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
- kmem_cache_free(f2fs_crypto_ctx_cachep, ctx);
- } else {
- spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
- list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
- spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
- }
-}
-
-/**
- * f2fs_get_crypto_ctx() - Gets an encryption context
- * @inode: The inode for which we are doing the crypto
- *
- * Allocates and initializes an encryption context.
- *
- * Return: An allocated and initialized encryption context on success; error
- * value or NULL otherwise.
- */
-struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode)
-{
- struct f2fs_crypto_ctx *ctx = NULL;
- unsigned long flags;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
- if (ci == NULL)
- return ERR_PTR(-ENOKEY);
-
- /*
- * We first try getting the ctx from a free list because in
- * the common case the ctx will have an allocated and
- * initialized crypto tfm, so it's probably a worthwhile
- * optimization. For the bounce page, we first try getting it
- * from the kernel allocator because that's just about as fast
- * as getting it from a list and because a cache of free pages
- * should generally be a "last resort" option for a filesystem
- * to be able to do its job.
- */
- spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
- ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs,
- struct f2fs_crypto_ctx, free_list);
- if (ctx)
- list_del(&ctx->free_list);
- spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
- if (!ctx) {
- ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS);
- if (!ctx)
- return ERR_PTR(-ENOMEM);
- ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
- } else {
- ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
- }
- ctx->flags &= ~F2FS_WRITE_PATH_FL;
- return ctx;
-}
-
-/*
- * Call f2fs_decrypt on every single page, reusing the encryption
- * context.
- */
-static void completion_pages(struct work_struct *work)
-{
- struct f2fs_crypto_ctx *ctx =
- container_of(work, struct f2fs_crypto_ctx, r.work);
- struct bio *bio = ctx->r.bio;
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment_all(bv, bio, i) {
- struct page *page = bv->bv_page;
- int ret = f2fs_decrypt(ctx, page);
-
- if (ret) {
- WARN_ON_ONCE(1);
- SetPageError(page);
- } else
- SetPageUptodate(page);
- unlock_page(page);
- }
- f2fs_release_crypto_ctx(ctx);
- bio_put(bio);
-}
-
-void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio)
-{
- INIT_WORK(&ctx->r.work, completion_pages);
- ctx->r.bio = bio;
- queue_work(f2fs_read_workqueue, &ctx->r.work);
-}
-
-static void f2fs_crypto_destroy(void)
-{
- struct f2fs_crypto_ctx *pos, *n;
-
- list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list)
- kmem_cache_free(f2fs_crypto_ctx_cachep, pos);
- INIT_LIST_HEAD(&f2fs_free_crypto_ctxs);
- if (f2fs_bounce_page_pool)
- mempool_destroy(f2fs_bounce_page_pool);
- f2fs_bounce_page_pool = NULL;
-}
-
-/**
- * f2fs_crypto_initialize() - Set up for f2fs encryption.
- *
- * We only call this when we start accessing encrypted files, since it
- * results in memory getting allocated that wouldn't otherwise be used.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int f2fs_crypto_initialize(void)
-{
- int i, res = -ENOMEM;
-
- if (f2fs_bounce_page_pool)
- return 0;
-
- mutex_lock(&crypto_init);
- if (f2fs_bounce_page_pool)
- goto already_initialized;
-
- for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
- struct f2fs_crypto_ctx *ctx;
-
- ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL);
- if (!ctx)
- goto fail;
- list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
- }
-
- /* must be allocated at the last step to avoid race condition above */
- f2fs_bounce_page_pool =
- mempool_create_page_pool(num_prealloc_crypto_pages, 0);
- if (!f2fs_bounce_page_pool)
- goto fail;
-
-already_initialized:
- mutex_unlock(&crypto_init);
- return 0;
-fail:
- f2fs_crypto_destroy();
- mutex_unlock(&crypto_init);
- return res;
-}
-
-/**
- * f2fs_exit_crypto() - Shutdown the f2fs encryption system
- */
-void f2fs_exit_crypto(void)
-{
- f2fs_crypto_destroy();
-
- if (f2fs_read_workqueue)
- destroy_workqueue(f2fs_read_workqueue);
- if (f2fs_crypto_ctx_cachep)
- kmem_cache_destroy(f2fs_crypto_ctx_cachep);
- if (f2fs_crypt_info_cachep)
- kmem_cache_destroy(f2fs_crypt_info_cachep);
-}
-
-int __init f2fs_init_crypto(void)
-{
- int res = -ENOMEM;
-
- f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0);
- if (!f2fs_read_workqueue)
- goto fail;
-
- f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx,
- SLAB_RECLAIM_ACCOUNT);
- if (!f2fs_crypto_ctx_cachep)
- goto fail;
-
- f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info,
- SLAB_RECLAIM_ACCOUNT);
- if (!f2fs_crypt_info_cachep)
- goto fail;
-
- return 0;
-fail:
- f2fs_exit_crypto();
- return res;
-}
-
-void f2fs_restore_and_release_control_page(struct page **page)
-{
- struct f2fs_crypto_ctx *ctx;
- struct page *bounce_page;
-
- /* The bounce data pages are unmapped. */
- if ((*page)->mapping)
- return;
-
- /* The bounce data page is unmapped. */
- bounce_page = *page;
- ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page);
-
- /* restore control page */
- *page = ctx->w.control_page;
-
- f2fs_restore_control_page(bounce_page);
-}
-
-void f2fs_restore_control_page(struct page *data_page)
-{
- struct f2fs_crypto_ctx *ctx =
- (struct f2fs_crypto_ctx *)page_private(data_page);
-
- set_page_private(data_page, (unsigned long)NULL);
- ClearPagePrivate(data_page);
- unlock_page(data_page);
- f2fs_release_crypto_ctx(ctx);
-}
-
-/**
- * f2fs_crypt_complete() - The completion callback for page encryption
- * @req: The asynchronous encryption request context
- * @res: The result of the encryption operation
- */
-static void f2fs_crypt_complete(struct crypto_async_request *req, int res)
-{
- struct f2fs_completion_result *ecr = req->data;
-
- if (res == -EINPROGRESS)
- return;
- ecr->res = res;
- complete(&ecr->completion);
-}
-
-typedef enum {
- F2FS_DECRYPT = 0,
- F2FS_ENCRYPT,
-} f2fs_direction_t;
-
-static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx,
- struct inode *inode,
- f2fs_direction_t rw,
- pgoff_t index,
- struct page *src_page,
- struct page *dest_page)
-{
- u8 xts_tweak[F2FS_XTS_TWEAK_SIZE];
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
- struct scatterlist dst, src;
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
- struct crypto_ablkcipher *tfm = ci->ci_ctfm;
- int res = 0;
-
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n",
- __func__);
- return -ENOMEM;
- }
- ablkcipher_request_set_callback(
- req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- f2fs_crypt_complete, &ecr);
-
- BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index));
- memcpy(xts_tweak, &index, sizeof(index));
- memset(&xts_tweak[sizeof(index)], 0,
- F2FS_XTS_TWEAK_SIZE - sizeof(index));
-
- sg_init_table(&dst, 1);
- sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
- sg_init_table(&src, 1);
- sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
- ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
- xts_tweak);
- if (rw == F2FS_DECRYPT)
- res = crypto_ablkcipher_decrypt(req);
- else
- res = crypto_ablkcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
- ablkcipher_request_free(req);
- if (res) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_ablkcipher_encrypt() returned %d\n",
- __func__, res);
- return res;
- }
- return 0;
-}
-
-static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx)
-{
- ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
- if (ctx->w.bounce_page == NULL)
- return ERR_PTR(-ENOMEM);
- ctx->flags |= F2FS_WRITE_PATH_FL;
- return ctx->w.bounce_page;
-}
-
-/**
- * f2fs_encrypt() - Encrypts a page
- * @inode: The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- *
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
- *
- * Called on the page write path. The caller must call
- * f2fs_restore_control_page() on the returned ciphertext page to
- * release the bounce buffer and the encryption context.
- *
- * Return: An allocated page with the encrypted content on success. Else, an
- * error value or NULL.
- */
-struct page *f2fs_encrypt(struct inode *inode,
- struct page *plaintext_page)
-{
- struct f2fs_crypto_ctx *ctx;
- struct page *ciphertext_page = NULL;
- int err;
-
- BUG_ON(!PageLocked(plaintext_page));
-
- ctx = f2fs_get_crypto_ctx(inode);
- if (IS_ERR(ctx))
- return (struct page *)ctx;
-
- /* The encryption operation will require a bounce page. */
- ciphertext_page = alloc_bounce_page(ctx);
- if (IS_ERR(ciphertext_page))
- goto err_out;
-
- ctx->w.control_page = plaintext_page;
- err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index,
- plaintext_page, ciphertext_page);
- if (err) {
- ciphertext_page = ERR_PTR(err);
- goto err_out;
- }
-
- SetPagePrivate(ciphertext_page);
- set_page_private(ciphertext_page, (unsigned long)ctx);
- lock_page(ciphertext_page);
- return ciphertext_page;
-
-err_out:
- f2fs_release_crypto_ctx(ctx);
- return ciphertext_page;
-}
-
-/**
- * f2fs_decrypt() - Decrypts a page in-place
- * @ctx: The encryption context.
- * @page: The page to decrypt. Must be locked.
- *
- * Decrypts page in-place using the ctx encryption context.
- *
- * Called from the read completion callback.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page)
-{
- BUG_ON(!PageLocked(page));
-
- return f2fs_page_crypto(ctx, page->mapping->host,
- F2FS_DECRYPT, page->index, page, page);
-}
-
-/*
- * Convenience function which takes care of allocating and
- * deallocating the encryption context
- */
-int f2fs_decrypt_one(struct inode *inode, struct page *page)
-{
- struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode);
- int ret;
-
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
- ret = f2fs_decrypt(ctx, page);
- f2fs_release_crypto_ctx(ctx);
- return ret;
-}
-
-bool f2fs_valid_contents_enc_mode(uint32_t mode)
-{
- return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS);
-}
-
-/**
- * f2fs_validate_encryption_key_size() - Validate the encryption key size
- * @mode: The key mode.
- * @size: The key size to validate.
- *
- * Return: The validated key size for @mode. Zero if invalid.
- */
-uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size)
-{
- if (size == f2fs_encryption_key_size(mode))
- return size;
- return 0;
-}
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
deleted file mode 100644
index 5de2d866a25c..000000000000
--- a/fs/f2fs/crypto_key.c
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * linux/fs/f2fs/crypto_key.c
- *
- * Copied from linux/fs/f2fs/crypto_key.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions for f2fs
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <uapi/linux/keyctl.h>
-#include <crypto/hash.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
- struct f2fs_completion_result *ecr = req->data;
-
- if (rc == -EINPROGRESS)
- return;
-
- ecr->res = rc;
- complete(&ecr->completion);
-}
-
-/**
- * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivatio.
- * @source_key: Source key to which to apply derivation.
- * @derived_key: Derived key.
- *
- * Return: Zero on success; non-zero otherwise.
- */
-static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE],
- char source_key[F2FS_AES_256_XTS_KEY_SIZE],
- char derived_key[F2FS_AES_256_XTS_KEY_SIZE])
-{
- int res = 0;
- struct ablkcipher_request *req = NULL;
- DECLARE_F2FS_COMPLETION_RESULT(ecr);
- struct scatterlist src_sg, dst_sg;
- struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
- 0);
-
- if (IS_ERR(tfm)) {
- res = PTR_ERR(tfm);
- tfm = NULL;
- goto out;
- }
- crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
- req = ablkcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- res = -ENOMEM;
- goto out;
- }
- ablkcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- derive_crypt_complete, &ecr);
- res = crypto_ablkcipher_setkey(tfm, deriving_key,
- F2FS_AES_128_ECB_KEY_SIZE);
- if (res < 0)
- goto out;
-
- sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE);
- sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE);
- ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
- F2FS_AES_256_XTS_KEY_SIZE, NULL);
- res = crypto_ablkcipher_encrypt(req);
- if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
- wait_for_completion(&ecr.completion);
- res = ecr.res;
- }
-out:
- if (req)
- ablkcipher_request_free(req);
- if (tfm)
- crypto_free_ablkcipher(tfm);
- return res;
-}
-
-static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
-{
- if (!ci)
- return;
-
- key_put(ci->ci_keyring_key);
- crypto_free_ablkcipher(ci->ci_ctfm);
- kmem_cache_free(f2fs_crypt_info_cachep, ci);
-}
-
-void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_crypt_info *prev;
-
- if (ci == NULL)
- ci = ACCESS_ONCE(fi->i_crypt_info);
- if (ci == NULL)
- return;
- prev = cmpxchg(&fi->i_crypt_info, ci, NULL);
- if (prev != ci)
- return;
-
- f2fs_free_crypt_info(ci);
-}
-
-int _f2fs_get_encryption_info(struct inode *inode)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_crypt_info *crypt_info;
- char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
- (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
- struct key *keyring_key = NULL;
- struct f2fs_encryption_key *master_key;
- struct f2fs_encryption_context ctx;
- const struct user_key_payload *ukp;
- struct crypto_ablkcipher *ctfm;
- const char *cipher_str;
- char raw_key[F2FS_MAX_KEY_SIZE];
- char mode;
- int res;
-
- res = f2fs_crypto_initialize();
- if (res)
- return res;
-retry:
- crypt_info = ACCESS_ONCE(fi->i_crypt_info);
- if (crypt_info) {
- if (!crypt_info->ci_keyring_key ||
- key_validate(crypt_info->ci_keyring_key) == 0)
- return 0;
- f2fs_free_encryption_info(inode, crypt_info);
- goto retry;
- }
-
- res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
- &ctx, sizeof(ctx), NULL);
- if (res < 0)
- return res;
- else if (res != sizeof(ctx))
- return -EINVAL;
- res = 0;
-
- crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS);
- if (!crypt_info)
- return -ENOMEM;
-
- crypt_info->ci_flags = ctx.flags;
- crypt_info->ci_data_mode = ctx.contents_encryption_mode;
- crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
- crypt_info->ci_ctfm = NULL;
- crypt_info->ci_keyring_key = NULL;
- memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
- sizeof(crypt_info->ci_master_key));
- if (S_ISREG(inode->i_mode))
- mode = crypt_info->ci_data_mode;
- else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- mode = crypt_info->ci_filename_mode;
- else
- BUG();
-
- switch (mode) {
- case F2FS_ENCRYPTION_MODE_AES_256_XTS:
- cipher_str = "xts(aes)";
- break;
- case F2FS_ENCRYPTION_MODE_AES_256_CTS:
- cipher_str = "cts(cbc(aes))";
- break;
- default:
- printk_once(KERN_WARNING
- "f2fs: unsupported key mode %d (ino %u)\n",
- mode, (unsigned) inode->i_ino);
- res = -ENOKEY;
- goto out;
- }
-
- memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX,
- F2FS_KEY_DESC_PREFIX_SIZE);
- sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE,
- "%*phN", F2FS_KEY_DESCRIPTOR_SIZE,
- ctx.master_key_descriptor);
- full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
- (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0';
- keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
- if (IS_ERR(keyring_key)) {
- res = PTR_ERR(keyring_key);
- keyring_key = NULL;
- goto out;
- }
- crypt_info->ci_keyring_key = keyring_key;
- BUG_ON(keyring_key->type != &key_type_logon);
- ukp = user_key_payload(keyring_key);
- if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
- res = -EINVAL;
- goto out;
- }
- master_key = (struct f2fs_encryption_key *)ukp->data;
- BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE !=
- F2FS_KEY_DERIVATION_NONCE_SIZE);
- BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE);
- res = f2fs_derive_key_aes(ctx.nonce, master_key->raw,
- raw_key);
- if (res)
- goto out;
-
- ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
- if (!ctfm || IS_ERR(ctfm)) {
- res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- printk(KERN_DEBUG
- "%s: error %d (inode %u) allocating crypto tfm\n",
- __func__, res, (unsigned) inode->i_ino);
- goto out;
- }
- crypt_info->ci_ctfm = ctfm;
- crypto_ablkcipher_clear_flags(ctfm, ~0);
- crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
- CRYPTO_TFM_REQ_WEAK_KEY);
- res = crypto_ablkcipher_setkey(ctfm, raw_key,
- f2fs_encryption_key_size(mode));
- if (res)
- goto out;
-
- memzero_explicit(raw_key, sizeof(raw_key));
- if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) {
- f2fs_free_crypt_info(crypt_info);
- goto retry;
- }
- return 0;
-
-out:
- if (res == -ENOKEY && !S_ISREG(inode->i_mode))
- res = 0;
-
- f2fs_free_crypt_info(crypt_info);
- memzero_explicit(raw_key, sizeof(raw_key));
- return res;
-}
-
-int f2fs_has_encryption_key(struct inode *inode)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
-
- return (fi->i_crypt_info != NULL);
-}
diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
deleted file mode 100644
index d4a96af513c2..000000000000
--- a/fs/f2fs/crypto_policy.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * copied from linux/fs/ext4/crypto_policy.c
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility.
- *
- * This contains encryption policy functions for f2fs with some modifications
- * to support f2fs-specific xattr APIs.
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static int f2fs_inode_has_encryption_context(struct inode *inode)
-{
- int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
- return (res > 0);
-}
-
-/*
- * check whether the policy is consistent with the encryption context
- * for the inode
- */
-static int f2fs_is_encryption_context_consistent_with_policy(
- struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
- struct f2fs_encryption_context ctx;
- int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), NULL);
-
- if (res != sizeof(ctx))
- return 0;
-
- return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
- F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (ctx.flags == policy->flags) &&
- (ctx.contents_encryption_mode ==
- policy->contents_encryption_mode) &&
- (ctx.filenames_encryption_mode ==
- policy->filenames_encryption_mode));
-}
-
-static int f2fs_create_encryption_context_from_policy(
- struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
- struct f2fs_encryption_context ctx;
-
- ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
- memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
- F2FS_KEY_DESCRIPTOR_SIZE);
-
- if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid contents encryption mode %d\n", __func__,
- policy->contents_encryption_mode);
- return -EINVAL;
- }
-
- if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
- printk(KERN_WARNING
- "%s: Invalid filenames encryption mode %d\n", __func__,
- policy->filenames_encryption_mode);
- return -EINVAL;
- }
-
- if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
- return -EINVAL;
-
- ctx.contents_encryption_mode = policy->contents_encryption_mode;
- ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
- ctx.flags = policy->flags;
- BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
- get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
-
- return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), NULL, XATTR_CREATE);
-}
-
-int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
- struct inode *inode)
-{
- if (policy->version != 0)
- return -EINVAL;
-
- if (!S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- if (!f2fs_inode_has_encryption_context(inode)) {
- if (!f2fs_empty_dir(inode))
- return -ENOTEMPTY;
- return f2fs_create_encryption_context_from_policy(inode,
- policy);
- }
-
- if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
- return 0;
-
- printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
- __func__);
- return -EINVAL;
-}
-
-int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
-{
- struct f2fs_encryption_context ctx;
- int res;
-
- if (!f2fs_encrypted_inode(inode))
- return -ENODATA;
-
- res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
- &ctx, sizeof(ctx), NULL);
- if (res != sizeof(ctx))
- return -ENODATA;
- if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
- return -EINVAL;
-
- policy->version = 0;
- policy->contents_encryption_mode = ctx.contents_encryption_mode;
- policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
- policy->flags = ctx.flags;
- memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
- F2FS_KEY_DESCRIPTOR_SIZE);
- return 0;
-}
-
-int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
- struct inode *child)
-{
- struct f2fs_crypt_info *parent_ci, *child_ci;
- int res;
-
- if ((parent == NULL) || (child == NULL)) {
- pr_err("parent %p child %p\n", parent, child);
- BUG_ON(1);
- }
-
- /* no restrictions if the parent directory is not encrypted */
- if (!f2fs_encrypted_inode(parent))
- return 1;
- /* if the child directory is not encrypted, this is always a problem */
- if (!f2fs_encrypted_inode(child))
- return 0;
- res = f2fs_get_encryption_info(parent);
- if (res)
- return 0;
- res = f2fs_get_encryption_info(child);
- if (res)
- return 0;
- parent_ci = F2FS_I(parent)->i_crypt_info;
- child_ci = F2FS_I(child)->i_crypt_info;
- if (!parent_ci && !child_ci)
- return 1;
- if (!parent_ci || !child_ci)
- return 0;
-
- return (memcmp(parent_ci->ci_master_key,
- child_ci->ci_master_key,
- F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
- (parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
- (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
- (parent_ci->ci_flags == child_ci->ci_flags));
-}
-
-/**
- * f2fs_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child: Child inode that inherits the context from @parent.
- *
- * Return: Zero on success, non-zero otherwise
- */
-int f2fs_inherit_context(struct inode *parent, struct inode *child,
- struct page *ipage)
-{
- struct f2fs_encryption_context ctx;
- struct f2fs_crypt_info *ci;
- int res;
-
- res = f2fs_get_encryption_info(parent);
- if (res < 0)
- return res;
-
- ci = F2FS_I(parent)->i_crypt_info;
- BUG_ON(ci == NULL);
-
- ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-
- ctx.contents_encryption_mode = ci->ci_data_mode;
- ctx.filenames_encryption_mode = ci->ci_filename_mode;
- ctx.flags = ci->ci_flags;
- memcpy(ctx.master_key_descriptor, ci->ci_master_key,
- F2FS_KEY_DESCRIPTOR_SIZE);
-
- get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
- return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION,
- F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
- sizeof(ctx), ipage, XATTR_CREATE);
-}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 972eab7ac071..5dafb9cef12e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -34,9 +34,9 @@ static void f2fs_read_end_io(struct bio *bio)
if (f2fs_bio_encrypted(bio)) {
if (bio->bi_error) {
- f2fs_release_crypto_ctx(bio->bi_private);
+ fscrypt_release_ctx(bio->bi_private);
} else {
- f2fs_end_io_crypto_work(bio->bi_private, bio);
+ fscrypt_decrypt_bio_pages(bio->bi_private, bio);
return;
}
}
@@ -64,10 +64,9 @@ static void f2fs_write_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- f2fs_restore_and_release_control_page(&page);
+ fscrypt_pullback_bio_page(&page, true);
if (unlikely(bio->bi_error)) {
- set_page_dirty(page);
set_bit(AS_EIO, &page->mapping->flags);
f2fs_stop_checkpoint(sbi);
}
@@ -75,8 +74,7 @@ static void f2fs_write_end_io(struct bio *bio)
dec_page_count(sbi, F2FS_WRITEBACK);
}
- if (!get_pages(sbi, F2FS_WRITEBACK) &&
- !list_empty(&sbi->cp_wait.task_list))
+ if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
bio_put(bio);
@@ -116,8 +114,54 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
io->bio = NULL;
}
-void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
- enum page_type type, int rw)
+static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
+ struct page *page, nid_t ino)
+{
+ struct bio_vec *bvec;
+ struct page *target;
+ int i;
+
+ if (!io->bio)
+ return false;
+
+ if (!inode && !page && !ino)
+ return true;
+
+ bio_for_each_segment_all(bvec, io->bio, i) {
+
+ if (bvec->bv_page->mapping)
+ target = bvec->bv_page;
+ else
+ target = fscrypt_control_page(bvec->bv_page);
+
+ if (inode && inode == target->mapping->host)
+ return true;
+ if (page && page == target)
+ return true;
+ if (ino && ino == ino_of_node(target))
+ return true;
+ }
+
+ return false;
+}
+
+static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct page *page, nid_t ino,
+ enum page_type type)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io = &sbi->write_io[btype];
+ bool ret;
+
+ down_read(&io->io_rwsem);
+ ret = __has_merged_page(io, inode, page, ino);
+ up_read(&io->io_rwsem);
+ return ret;
+}
+
+static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type, int rw)
{
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io;
@@ -126,6 +170,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
down_write(&io->io_rwsem);
+ if (!__has_merged_page(io, inode, page, ino))
+ goto out;
+
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
io->fio.type = META_FLUSH;
@@ -135,9 +182,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
}
__submit_merged_bio(io);
+out:
up_write(&io->io_rwsem);
}
+void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type,
+ int rw)
+{
+ __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw);
+}
+
+void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi,
+ struct inode *inode, struct page *page,
+ nid_t ino, enum page_type type, int rw)
+{
+ if (has_merged_page(sbi, inode, page, ino, type))
+ __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw);
+}
+
+void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
+{
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
+ f2fs_submit_merged_bio(sbi, META, WRITE);
+}
+
/*
* Fill the locked page with data located in the block address.
* Return unlocked page.
@@ -145,15 +214,16 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio;
- struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+ struct page *page = fio->encrypted_page ?
+ fio->encrypted_page : fio->page;
trace_f2fs_submit_page_bio(page, fio);
f2fs_trace_ios(fio, 0);
/* Allocate a new bio */
- bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw));
+ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw));
- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
bio_put(bio);
return -EFAULT;
}
@@ -172,33 +242,36 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio)
io = is_read ? &sbi->read_io : &sbi->write_io[btype];
- verify_block_addr(sbi, fio->blk_addr);
+ if (fio->old_blkaddr != NEW_ADDR)
+ verify_block_addr(sbi, fio->old_blkaddr);
+ verify_block_addr(sbi, fio->new_blkaddr);
down_write(&io->io_rwsem);
if (!is_read)
inc_page_count(sbi, F2FS_WRITEBACK);
- if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
+ if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 ||
io->fio.rw != fio->rw))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
int bio_blocks = MAX_BIO_BLOCKS(sbi);
- io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
+ io->bio = __bio_alloc(sbi, fio->new_blkaddr,
+ bio_blocks, is_read);
io->fio = *fio;
}
bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
- if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
+ if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
+ PAGE_SIZE) {
__submit_merged_bio(io);
goto alloc_new;
}
- io->last_block_in_bio = fio->blk_addr;
+ io->last_block_in_bio = fio->new_blkaddr;
f2fs_trace_ios(fio, 0);
up_write(&io->io_rwsem);
@@ -218,14 +291,22 @@ void set_data_blkaddr(struct dnode_of_data *dn)
struct page *node_page = dn->node_page;
unsigned int ofs_in_node = dn->ofs_in_node;
- f2fs_wait_on_page_writeback(node_page, NODE);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
rn = F2FS_NODE(node_page);
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
- set_page_dirty(node_page);
+ if (set_page_dirty(node_page))
+ dn->node_changed = true;
+}
+
+void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
+{
+ dn->data_blkaddr = blkaddr;
+ set_data_blkaddr(dn);
+ f2fs_update_extent_cache(dn);
}
int reserve_new_block(struct dnode_of_data *dn)
@@ -325,13 +406,13 @@ got_it:
* see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
*/
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
unlock_page(page);
return page;
}
- fio.blk_addr = dn.data_blkaddr;
+ fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
fio.page = page;
err = f2fs_submit_page_bio(&fio);
if (err)
@@ -412,7 +493,7 @@ struct page *get_new_data_page(struct inode *inode,
struct page *page;
struct dnode_of_data dn;
int err;
-repeat:
+
page = f2fs_grab_cache_page(mapping, index, true);
if (!page) {
/*
@@ -436,22 +517,21 @@ repeat:
goto got_it;
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
} else {
f2fs_put_page(page, 1);
- page = get_read_data_page(inode, index, READ_SYNC, true);
+ /* if ipage exists, blkaddr should be NEW_ADDR */
+ f2fs_bug_on(F2FS_I_SB(inode), ipage);
+ page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
- goto repeat;
-
- /* wait for read completion */
- lock_page(page);
+ return page;
}
got_it:
if (new_i_size && i_size_read(inode) <
- ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) {
- i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT));
+ ((loff_t)(index + 1) << PAGE_SHIFT)) {
+ i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
/* Only the directory inode sets new_i_size */
set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
}
@@ -461,7 +541,6 @@ got_it:
static int __allocate_data_block(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
struct f2fs_summary sum;
struct node_info ni;
int seg = CURSEG_WARM_DATA;
@@ -489,72 +568,41 @@ alloc:
set_data_blkaddr(dn);
/* update i_size */
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
+ if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
i_size_write(dn->inode,
- ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
-
- /* direct IO doesn't use extent cache to maximize the performance */
- f2fs_drop_largest_extent(dn->inode, fofs);
-
+ ((loff_t)(fofs + 1) << PAGE_SHIFT));
return 0;
}
-static void __allocate_data_blocks(struct inode *inode, loff_t offset,
- size_t count)
+ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dnode_of_data dn;
- u64 start = F2FS_BYTES_TO_BLK(offset);
- u64 len = F2FS_BYTES_TO_BLK(count);
- bool allocated;
- u64 end_offset;
-
- while (len) {
- f2fs_balance_fs(sbi);
- f2fs_lock_op(sbi);
-
- /* When reading holes, we need its node page */
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- if (get_dnode_of_data(&dn, start, ALLOC_NODE))
- goto out;
-
- allocated = false;
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
-
- while (dn.ofs_in_node < end_offset && len) {
- block_t blkaddr;
-
- if (unlikely(f2fs_cp_error(sbi)))
- goto sync_out;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct f2fs_map_blocks map;
+ ssize_t ret = 0;
- blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
- if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
- if (__allocate_data_block(&dn))
- goto sync_out;
- allocated = true;
- }
- len--;
- start++;
- dn.ofs_in_node++;
- }
+ map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos);
+ map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from));
+ map.m_next_pgofs = NULL;
- if (allocated)
- sync_inode_page(&dn);
+ if (f2fs_encrypted_inode(inode))
+ return 0;
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+ return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
}
- return;
-
-sync_out:
- if (allocated)
- sync_inode_page(&dn);
- f2fs_put_dnode(&dn);
-out:
- f2fs_unlock_op(sbi);
- return;
+ if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
+ }
+ if (!f2fs_has_inline_data(inode))
+ return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO);
+ return ret;
}
/*
@@ -566,7 +614,7 @@ out:
* b. do not use extent cache for better performance
* c. give the block addresses to blockdev
*/
-static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag)
{
unsigned int maxblocks = map->m_len;
@@ -577,6 +625,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int err = 0, ofs = 1;
struct extent_info ei;
bool allocated = false;
+ block_t blkaddr;
map->m_len = 0;
map->m_flags = 0;
@@ -584,138 +633,129 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
/* it only supports block size == page size */
pgofs = (pgoff_t)map->m_lblk;
- if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
map->m_pblk = ei.blk + pgofs - ei.fofs;
map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
map->m_flags = F2FS_MAP_MAPPED;
goto out;
}
+next_dnode:
if (create)
- f2fs_lock_op(F2FS_I_SB(inode));
+ f2fs_lock_op(sbi);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
- if (err == -ENOENT)
+ if (err == -ENOENT) {
err = 0;
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs =
+ get_next_page_offset(&dn, pgofs);
+ }
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) {
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+
+next_block:
+ blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
if (create) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
- goto put_out;
+ goto sync_out;
+ }
+ if (flag == F2FS_GET_BLOCK_PRE_AIO) {
+ if (blkaddr == NULL_ADDR)
+ err = reserve_new_block(&dn);
+ } else {
+ err = __allocate_data_block(&dn);
}
- err = __allocate_data_block(&dn);
if (err)
- goto put_out;
+ goto sync_out;
allocated = true;
map->m_flags = F2FS_MAP_NEW;
+ blkaddr = dn.data_blkaddr;
} else {
+ if (flag == F2FS_GET_BLOCK_FIEMAP &&
+ blkaddr == NULL_ADDR) {
+ if (map->m_next_pgofs)
+ *map->m_next_pgofs = pgofs + 1;
+ }
if (flag != F2FS_GET_BLOCK_FIEMAP ||
- dn.data_blkaddr != NEW_ADDR) {
+ blkaddr != NEW_ADDR) {
if (flag == F2FS_GET_BLOCK_BMAP)
err = -ENOENT;
- goto put_out;
+ goto sync_out;
}
-
- /*
- * preallocated unwritten block should be mapped
- * for fiemap.
- */
- if (dn.data_blkaddr == NEW_ADDR)
- map->m_flags = F2FS_MAP_UNWRITTEN;
}
}
- map->m_flags |= F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- map->m_len = 1;
+ if (map->m_len == 0) {
+ /* preallocated unwritten block should be mapped for fiemap. */
+ if (blkaddr == NEW_ADDR)
+ map->m_flags |= F2FS_MAP_UNWRITTEN;
+ map->m_flags |= F2FS_MAP_MAPPED;
+
+ map->m_pblk = blkaddr;
+ map->m_len = 1;
+ } else if ((map->m_pblk != NEW_ADDR &&
+ blkaddr == (map->m_pblk + ofs)) ||
+ (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
+ flag == F2FS_GET_BLOCK_PRE_DIO ||
+ flag == F2FS_GET_BLOCK_PRE_AIO) {
+ ofs++;
+ map->m_len++;
+ } else {
+ goto sync_out;
+ }
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
dn.ofs_in_node++;
pgofs++;
-get_next:
- if (dn.ofs_in_node >= end_offset) {
+ if (map->m_len < maxblocks) {
+ if (dn.ofs_in_node < end_offset)
+ goto next_block;
+
if (allocated)
sync_inode_page(&dn);
- allocated = false;
f2fs_put_dnode(&dn);
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, mode);
- if (err) {
- if (err == -ENOENT)
- err = 0;
- goto unlock_out;
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, allocated);
}
-
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ allocated = false;
+ goto next_dnode;
}
- if (maxblocks > map->m_len) {
- block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
-
- if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
- if (create) {
- if (unlikely(f2fs_cp_error(sbi))) {
- err = -EIO;
- goto sync_out;
- }
- err = __allocate_data_block(&dn);
- if (err)
- goto sync_out;
- allocated = true;
- map->m_flags |= F2FS_MAP_NEW;
- blkaddr = dn.data_blkaddr;
- } else {
- /*
- * we only merge preallocated unwritten blocks
- * for fiemap.
- */
- if (flag != F2FS_GET_BLOCK_FIEMAP ||
- blkaddr != NEW_ADDR)
- goto sync_out;
- }
- }
-
- /* Give more consecutive addresses for the readahead */
- if ((map->m_pblk != NEW_ADDR &&
- blkaddr == (map->m_pblk + ofs)) ||
- (map->m_pblk == NEW_ADDR &&
- blkaddr == NEW_ADDR)) {
- ofs++;
- dn.ofs_in_node++;
- pgofs++;
- map->m_len++;
- goto get_next;
- }
- }
sync_out:
if (allocated)
sync_inode_page(&dn);
-put_out:
f2fs_put_dnode(&dn);
unlock_out:
- if (create)
- f2fs_unlock_op(F2FS_I_SB(inode));
+ if (create) {
+ f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, allocated);
+ }
out:
trace_f2fs_map_blocks(inode, map, err);
return err;
}
static int __get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create, int flag)
+ struct buffer_head *bh, int create, int flag,
+ pgoff_t *next_pgofs)
{
struct f2fs_map_blocks map;
int ret;
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
+ map.m_next_pgofs = next_pgofs;
ret = f2fs_map_blocks(inode, &map, create, flag);
if (!ret) {
@@ -727,23 +767,29 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
}
static int get_data_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create, int flag)
+ struct buffer_head *bh_result, int create, int flag,
+ pgoff_t *next_pgofs)
{
- return __get_data_block(inode, iblock, bh_result, create, flag);
+ return __get_data_block(inode, iblock, bh_result, create,
+ flag, next_pgofs);
}
static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DIO);
+ F2FS_GET_BLOCK_DIO, NULL);
}
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
+ /* Block number less than F2FS MAX BLOCKS */
+ if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks))
+ return -EFBIG;
+
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_BMAP);
+ F2FS_GET_BLOCK_BMAP, NULL);
}
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -761,10 +807,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
struct buffer_head map_bh;
sector_t start_blk, last_blk;
- loff_t isize = i_size_read(inode);
+ pgoff_t next_pgofs;
+ loff_t isize;
u64 logical = 0, phys = 0, size = 0;
u32 flags = 0;
- bool past_eof = false, whole_file = false;
int ret = 0;
ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
@@ -777,82 +823,64 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
- if (len >= isize) {
- whole_file = true;
- len = isize;
- }
+ isize = i_size_read(inode);
+ if (start >= isize)
+ goto out;
+
+ if (start + len > isize)
+ len = isize - start;
if (logical_to_blk(inode, len) == 0)
len = blk_to_logical(inode, 1);
start_blk = logical_to_blk(inode, start);
last_blk = logical_to_blk(inode, start + len - 1);
+
next:
memset(&map_bh, 0, sizeof(struct buffer_head));
map_bh.b_size = len;
ret = get_data_block(inode, start_blk, &map_bh, 0,
- F2FS_GET_BLOCK_FIEMAP);
+ F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
if (ret)
goto out;
/* HOLE */
if (!buffer_mapped(&map_bh)) {
- start_blk++;
-
- if (!past_eof && blk_to_logical(inode, start_blk) >= isize)
- past_eof = 1;
-
- if (past_eof && size) {
- flags |= FIEMAP_EXTENT_LAST;
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- } else if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- size = 0;
- }
+ start_blk = next_pgofs;
+ /* Go through holes util pass the EOF */
+ if (blk_to_logical(inode, start_blk) < isize)
+ goto prep_next;
+ /* Found a hole beyond isize means no more extents.
+ * Note that the premise is that filesystems don't
+ * punch holes beyond isize and keep size unchanged.
+ */
+ flags |= FIEMAP_EXTENT_LAST;
+ }
- /* if we have holes up to/past EOF then we're done */
- if (start_blk > last_blk || past_eof || ret)
- goto out;
- } else {
- if (start_blk > last_blk && !whole_file) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- goto out;
- }
+ if (size) {
+ if (f2fs_encrypted_inode(inode))
+ flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
- /*
- * if size != 0 then we know we already have an extent
- * to add, so add it.
- */
- if (size) {
- ret = fiemap_fill_next_extent(fieinfo, logical,
- phys, size, flags);
- if (ret)
- goto out;
- }
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size, flags);
+ }
- logical = blk_to_logical(inode, start_blk);
- phys = blk_to_logical(inode, map_bh.b_blocknr);
- size = map_bh.b_size;
- flags = 0;
- if (buffer_unwritten(&map_bh))
- flags = FIEMAP_EXTENT_UNWRITTEN;
+ if (start_blk > last_blk || ret)
+ goto out;
- start_blk += logical_to_blk(inode, size);
+ logical = blk_to_logical(inode, start_blk);
+ phys = blk_to_logical(inode, map_bh.b_blocknr);
+ size = map_bh.b_size;
+ flags = 0;
+ if (buffer_unwritten(&map_bh))
+ flags = FIEMAP_EXTENT_UNWRITTEN;
- /*
- * If we are past the EOF, then we need to make sure as
- * soon as we find a hole that the last extent we found
- * is marked with FIEMAP_EXTENT_LAST
- */
- if (!past_eof && logical + size >= isize)
- past_eof = true;
- }
+ start_blk += logical_to_blk(inode, size);
+
+prep_next:
cond_resched();
if (fatal_signal_pending(current))
ret = -EINTR;
@@ -862,7 +890,7 @@ out:
if (ret == 1)
ret = 0;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -891,6 +919,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
+ map.m_next_pgofs = NULL;
for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
@@ -929,7 +958,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_len = last_block - block_in_file;
if (f2fs_map_blocks(inode, &map, 0,
- F2FS_GET_BLOCK_READ))
+ F2FS_GET_BLOCK_READ))
goto set_error_page;
}
got_it:
@@ -942,7 +971,7 @@ got_it:
goto confused;
}
} else {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
unlock_page(page);
goto next_page;
@@ -958,12 +987,12 @@ submit_and_realloc:
bio = NULL;
}
if (bio == NULL) {
- struct f2fs_crypto_ctx *ctx = NULL;
+ struct fscrypt_ctx *ctx = NULL;
if (f2fs_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
- ctx = f2fs_get_crypto_ctx(inode);
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
if (IS_ERR(ctx))
goto set_error_page;
@@ -976,7 +1005,7 @@ submit_and_realloc:
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
- f2fs_release_crypto_ctx(ctx);
+ fscrypt_release_ctx(ctx);
goto set_error_page;
}
bio->bi_bdev = bdev;
@@ -992,7 +1021,7 @@ submit_and_realloc:
goto next_page;
set_error_page:
SetPageError(page);
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
unlock_page(page);
goto next_page;
confused:
@@ -1003,7 +1032,7 @@ confused:
unlock_page(page);
next_page:
if (pages)
- page_cache_release(page);
+ put_page(page);
}
BUG_ON(pages && !list_empty(pages));
if (bio)
@@ -1054,23 +1083,33 @@ int do_write_data_page(struct f2fs_io_info *fio)
if (err)
return err;
- fio->blk_addr = dn.data_blkaddr;
+ fio->old_blkaddr = dn.data_blkaddr;
/* This page is already truncated */
- if (fio->blk_addr == NULL_ADDR) {
+ if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
goto out_writepage;
}
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ gfp_t gfp_flags = GFP_NOFS;
/* wait for GCed encrypted page writeback */
f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
- fio->blk_addr);
-
- fio->encrypted_page = f2fs_encrypt(inode, fio->page);
+ fio->old_blkaddr);
+retry_encrypt:
+ fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
+ gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
+ if (err == -ENOMEM) {
+ /* flush pending ios and wait for a while */
+ f2fs_flush_merged_bios(F2FS_I_SB(inode));
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ gfp_flags |= __GFP_NOFAIL;
+ err = 0;
+ goto retry_encrypt;
+ }
goto out_writepage;
}
}
@@ -1081,16 +1120,15 @@ int do_write_data_page(struct f2fs_io_info *fio)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(fio->blk_addr != NEW_ADDR &&
+ if (unlikely(fio->old_blkaddr != NEW_ADDR &&
!is_cold_data(page) &&
+ !IS_ATOMIC_WRITTEN_PAGE(page) &&
need_inplace_update(inode))) {
rewrite_data_page(fio);
set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
trace_f2fs_do_write_data_page(page, IPU);
} else {
write_data_page(&dn, fio);
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
trace_f2fs_do_write_data_page(page, OPU);
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
if (page->index == 0)
@@ -1108,7 +1146,7 @@ static int f2fs_write_data_page(struct page *page,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
- >> PAGE_CACHE_SHIFT;
+ >> PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
@@ -1129,11 +1167,11 @@ static int f2fs_write_data_page(struct page *page,
* If the offset is out-of-range of file size,
* this page does not have to be written to disk.
*/
- offset = i_size & (PAGE_CACHE_SIZE - 1);
+ offset = i_size & (PAGE_SIZE - 1);
if ((page->index >= end_index + 1) || !offset)
goto out;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
write:
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
@@ -1178,11 +1216,18 @@ out:
inode_dec_dirty_pages(inode);
if (err)
ClearPageUptodate(page);
+
+ if (wbc->for_reclaim) {
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE);
+ remove_dirty_inode(inode);
+ }
+
unlock_page(page);
- if (need_balance_fs)
- f2fs_balance_fs(sbi);
- if (wbc->for_reclaim)
+ f2fs_balance_fs(sbi, need_balance_fs);
+
+ if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, DATA, WRITE);
+
return 0;
redirty_out:
@@ -1232,8 +1277,8 @@ next:
cycled = 0;
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
@@ -1282,7 +1327,8 @@ continue_unlock:
if (PageWriteback(page)) {
if (wbc->sync_mode != WB_SYNC_NONE)
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page,
+ DATA, true);
else
goto continue_unlock;
}
@@ -1339,8 +1385,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
int ret;
long diff;
- trace_f2fs_writepages(mapping->host, wbc, DATA);
-
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
@@ -1354,41 +1398,124 @@ static int f2fs_write_data_pages(struct address_space *mapping,
available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
+ /* skip writing during file defragment */
+ if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
+ goto skip_write;
+
/* during POR, we don't need to trigger writepage at all. */
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto skip_write;
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
+
diff = nr_pages_to_write(sbi, DATA, wbc);
- if (!S_ISDIR(inode->i_mode)) {
+ if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) {
mutex_lock(&sbi->writepages);
locked = true;
}
ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
if (locked)
mutex_unlock(&sbi->writepages);
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return ret;
skip_write:
wbc->pages_skipped += get_dirty_pages(inode);
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
return 0;
}
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
+ loff_t i_size = i_size_read(inode);
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- truncate_blocks(inode, inode->i_size, true);
+ if (to > i_size) {
+ truncate_pagecache(inode, i_size);
+ truncate_blocks(inode, i_size, true);
}
}
+static int prepare_write_begin(struct f2fs_sb_info *sbi,
+ struct page *page, loff_t pos, unsigned len,
+ block_t *blk_addr, bool *node_changed)
+{
+ struct inode *inode = page->mapping->host;
+ pgoff_t index = page->index;
+ struct dnode_of_data dn;
+ struct page *ipage;
+ bool locked = false;
+ struct extent_info ei;
+ int err = 0;
+
+ /*
+ * we already allocated all the blocks, so we don't need to get
+ * the block addresses when there is no need to fill the page.
+ */
+ if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
+ len == PAGE_SIZE)
+ return 0;
+
+ if (f2fs_has_inline_data(inode) ||
+ (pos & PAGE_MASK) >= i_size_read(inode)) {
+ f2fs_lock_op(sbi);
+ locked = true;
+ }
+restart:
+ /* check inline_data */
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto unlock_out;
+ }
+
+ set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+ if (f2fs_has_inline_data(inode)) {
+ if (pos + len <= MAX_INLINE_DATA) {
+ read_inline_data(page, ipage);
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ set_inline_node(ipage);
+ } else {
+ err = f2fs_convert_inline_page(&dn, page);
+ if (err)
+ goto out;
+ if (dn.data_blkaddr == NULL_ADDR)
+ err = f2fs_get_block(&dn, index);
+ }
+ } else if (locked) {
+ err = f2fs_get_block(&dn, index);
+ } else {
+ if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ dn.data_blkaddr = ei.blk + index - ei.fofs;
+ } else {
+ /* hole case */
+ err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ if (err || (!err && dn.data_blkaddr == NULL_ADDR)) {
+ f2fs_put_dnode(&dn);
+ f2fs_lock_op(sbi);
+ locked = true;
+ goto restart;
+ }
+ }
+ }
+
+ /* convert_inline_page can make node_changed */
+ *blk_addr = dn.data_blkaddr;
+ *node_changed = dn.node_changed;
+out:
+ f2fs_put_dnode(&dn);
+unlock_out:
+ if (locked)
+ f2fs_unlock_op(sbi);
+ return err;
+}
+
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1396,15 +1523,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
- struct page *ipage;
- pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
- struct dnode_of_data dn;
+ pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
+ bool need_balance = false;
+ block_t blkaddr = NULL_ADDR;
int err = 0;
trace_f2fs_write_begin(inode, pos, len, flags);
- f2fs_balance_fs(sbi);
-
/*
* We should check this at this moment to avoid deadlock on inode page
* and #0 page. The locking rule for inline_data conversion should be:
@@ -1424,64 +1549,51 @@ repeat:
*pagep = page;
- f2fs_lock_op(sbi);
-
- /* check inline_data */
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
- goto unlock_fail;
- }
-
- set_new_dnode(&dn, inode, ipage, ipage, 0);
+ err = prepare_write_begin(sbi, page, pos, len,
+ &blkaddr, &need_balance);
+ if (err)
+ goto fail;
- if (f2fs_has_inline_data(inode)) {
- if (pos + len <= MAX_INLINE_DATA) {
- read_inline_data(page, ipage);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
- sync_inode_page(&dn);
- goto put_next;
+ if (need_balance && has_not_enough_free_secs(sbi, 0)) {
+ unlock_page(page);
+ f2fs_balance_fs(sbi, true);
+ lock_page(page);
+ if (page->mapping != mapping) {
+ /* The page got truncated from under us */
+ f2fs_put_page(page, 1);
+ goto repeat;
}
- err = f2fs_convert_inline_page(&dn, page);
- if (err)
- goto put_fail;
}
- err = f2fs_get_block(&dn, index);
- if (err)
- goto put_fail;
-put_next:
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
-
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, false);
/* wait for GCed encrypted page writeback */
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+ f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
- if (len == PAGE_CACHE_SIZE)
+ if (len == PAGE_SIZE)
goto out_update;
if (PageUptodate(page))
goto out_clear;
- if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ if ((pos & PAGE_MASK) >= i_size_read(inode)) {
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + len;
/* Reading beyond i_size is simple: memset to zero */
- zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+ zero_user_segments(page, 0, start, end, PAGE_SIZE);
goto out_update;
}
- if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ if (blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_SIZE);
} else {
struct f2fs_io_info fio = {
.sbi = sbi,
.type = DATA,
.rw = READ_SYNC,
- .blk_addr = dn.data_blkaddr,
+ .old_blkaddr = blkaddr,
+ .new_blkaddr = blkaddr,
.page = page,
.encrypted_page = NULL,
};
@@ -1501,7 +1613,7 @@ put_next:
/* avoid symlink page */
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- err = f2fs_decrypt_one(inode, page);
+ err = fscrypt_decrypt_page(page);
if (err)
goto fail;
}
@@ -1512,10 +1624,6 @@ out_clear:
clear_cold_data(page);
return 0;
-put_fail:
- f2fs_put_dnode(&dn);
-unlock_fail:
- f2fs_unlock_op(sbi);
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(mapping, pos + len);
@@ -1536,10 +1644,10 @@ static int f2fs_write_end(struct file *file,
if (pos + copied > i_size_read(inode)) {
i_size_write(inode, pos + copied);
mark_inode_dirty(inode);
- update_inode_page(inode);
}
f2fs_put_page(page, 1);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
@@ -1560,38 +1668,21 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t offset)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
+ struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
int err;
- /* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- return 0;
-
err = check_direct_IO(inode, iter, offset);
if (err)
return err;
- trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ return 0;
- if (iov_iter_rw(iter) == WRITE) {
- __allocate_data_blocks(inode, offset, count);
- if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
- err = -EIO;
- goto out;
- }
- }
+ trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
-out:
if (err < 0 && iov_iter_rw(iter) == WRITE)
f2fs_write_failed(mapping, offset + count);
@@ -1607,7 +1698,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
- (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
+ (offset % PAGE_SIZE || length != PAGE_SIZE))
return;
if (PageDirty(page)) {
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 478e5d54154f..f4a61a5ff79f 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -38,12 +38,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
si->total_ext = atomic64_read(&sbi->total_hit_ext);
- si->ext_tree = sbi->total_ext_tree;
+ si->ext_tree = atomic_read(&sbi->total_ext_tree);
+ si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
si->ext_node = atomic_read(&sbi->total_ext_node);
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
- si->ndirty_dirs = sbi->n_dirty_dirs;
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+ si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+ si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
+ si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
@@ -105,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
bimodal = 0;
total_vblocks = 0;
- blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+ blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
hblks_per_sec = blks_per_sec / 2;
for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
@@ -161,7 +164,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
/* build curseg */
si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
- si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+ si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE;
/* build dirty segmap */
si->base_mem += sizeof(struct dirty_seglist_info);
@@ -189,18 +192,18 @@ get_cache:
si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
sizeof(struct nat_entry_set);
si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
- si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
for (i = 0; i <= UPDATE_INO; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
- si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+ si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree);
si->cache_mem += atomic_read(&sbi->total_ext_node) *
sizeof(struct extent_node);
si->page_mem = 0;
npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
static int stat_show(struct seq_file *s, void *v)
@@ -211,12 +214,10 @@ static int stat_show(struct seq_file *s, void *v)
mutex_lock(&f2fs_stat_mutex);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
- char devname[BDEVNAME_SIZE];
-
update_general_status(si->sbi);
- seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
- bdevname(si->sbi->sb->s_bdev, devname), i++);
+ seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n",
+ si->sbi->sb->s_bdev, i++);
seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
si->sit_area_segs, si->nat_area_segs);
seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -269,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_count);
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
- seq_printf(s, "CP calls: %d\n", si->cp_count);
+ seq_printf(s, "CP calls: %d (BG: %d)\n",
+ si->cp_count, si->bg_cp_count);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
@@ -290,8 +292,8 @@ static int stat_show(struct seq_file *s, void *v)
!si->total_ext ? 0 :
div64_u64(si->hit_total * 100, si->total_ext),
si->hit_total, si->total_ext);
- seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n",
- si->ext_tree, si->ext_node);
+ seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
+ si->ext_tree, si->zombie_tree, si->ext_node);
seq_puts(s, "\nBalancing F2FS Async:\n");
seq_printf(s, " - inmem: %4d, wb: %4d\n",
si->inmem_pages, si->wb_pages);
@@ -299,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d\n",
si->ndirty_dent, si->ndirty_dirs);
+ seq_printf(s, " - datas: %4d in files:%4d\n",
+ si->ndirty_data, si->ndirty_files);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
@@ -406,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
kfree(si);
}
-void __init f2fs_create_root_stats(void)
+int __init f2fs_create_root_stats(void)
{
struct dentry *file;
f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
if (!f2fs_debugfs_root)
- return;
+ return -ENOMEM;
file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
NULL, &stat_fops);
if (!file) {
debugfs_remove(f2fs_debugfs_root);
f2fs_debugfs_root = NULL;
+ return -ENOMEM;
}
+
+ return 0;
}
void f2fs_destroy_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 7c1678ba8f92..af819571bce7 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -17,8 +17,8 @@
static unsigned long dir_blocks(struct inode *inode)
{
- return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
- >> PAGE_CACHE_SHIFT;
+ return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
+ >> PAGE_SHIFT;
}
static unsigned int dir_buckets(unsigned int level, int dir_level)
@@ -77,7 +77,7 @@ static unsigned long dir_block_index(unsigned int level,
}
static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
- struct f2fs_filename *fname,
+ struct fscrypt_name *fname,
f2fs_hash_t namehash,
int *max_slots,
struct page **res_page)
@@ -103,15 +103,15 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
return de;
}
-struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname,
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
f2fs_hash_t namehash, int *max_slots,
struct f2fs_dentry_ptr *d)
{
struct f2fs_dir_entry *de;
unsigned long bit_pos = 0;
int max_len = 0;
- struct f2fs_str de_name = FSTR_INIT(NULL, 0);
- struct f2fs_str *name = &fname->disk_name;
+ struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
+ struct fscrypt_str *name = &fname->disk_name;
if (max_slots)
*max_slots = 0;
@@ -157,7 +157,7 @@ found:
static struct f2fs_dir_entry *find_in_level(struct inode *dir,
unsigned int level,
- struct f2fs_filename *fname,
+ struct fscrypt_name *fname,
struct page **res_page)
{
struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
@@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
namehash = f2fs_dentry_hash(&name);
- f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
-
nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
@@ -220,12 +218,12 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
struct f2fs_dir_entry *de = NULL;
unsigned int max_depth;
unsigned int level;
- struct f2fs_filename fname;
+ struct fscrypt_name fname;
int err;
*res_page = NULL;
- err = f2fs_fname_setup_filename(dir, child, 1, &fname);
+ err = fscrypt_setup_filename(dir, child, 1, &fname);
if (err)
return NULL;
@@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
goto out;
max_depth = F2FS_I(dir)->i_current_depth;
+ if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
+ f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING,
+ "Corrupted max_depth of %lu: %u",
+ dir->i_ino, max_depth);
+ max_depth = MAX_DIR_HASH_DEPTH;
+ F2FS_I(dir)->i_current_depth = max_depth;
+ mark_inode_dirty(dir);
+ }
for (level = 0; level < max_depth; level++) {
de = find_in_level(dir, level, &fname, res_page);
@@ -245,7 +251,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
break;
}
out:
- f2fs_fname_free_filename(&fname);
+ fscrypt_free_filename(&fname);
return de;
}
@@ -290,7 +296,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
{
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
lock_page(page);
- f2fs_wait_on_page_writeback(page, type);
+ f2fs_wait_on_page_writeback(page, type, true);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode->i_mode);
f2fs_dentry_kunmap(dir, page);
@@ -305,7 +311,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
/* copy name info. to this inode page */
ri = F2FS_INODE(ipage);
@@ -335,24 +341,14 @@ int update_dent_inode(struct inode *inode, struct inode *to,
void do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d)
{
- struct f2fs_dir_entry *de;
-
- de = &d->dentry[0];
- de->name_len = cpu_to_le16(1);
- de->hash_code = 0;
- de->ino = cpu_to_le32(inode->i_ino);
- memcpy(d->filename[0], ".", 1);
- set_de_type(de, inode->i_mode);
+ struct qstr dot = QSTR_INIT(".", 1);
+ struct qstr dotdot = QSTR_INIT("..", 2);
- de = &d->dentry[1];
- de->hash_code = 0;
- de->name_len = cpu_to_le16(2);
- de->ino = cpu_to_le32(parent->i_ino);
- memcpy(d->filename[1], "..", 2);
- set_de_type(de, parent->i_mode);
+ /* update dirent of "." */
+ f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0);
- test_and_set_bit_le(0, (void *)d->bitmap);
- test_and_set_bit_le(1, (void *)d->bitmap);
+ /* update dirent of ".." */
+ f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1);
}
static int make_empty_dir(struct inode *inode,
@@ -407,7 +403,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
goto put_error;
if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
- err = f2fs_inherit_context(dir, inode, page);
+ err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
goto put_error;
}
@@ -444,7 +440,7 @@ error:
/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
truncate_inode_pages(&inode->i_data, 0);
truncate_blocks(inode, 0, false);
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
remove_inode_page(inode);
return ERR_PTR(err);
}
@@ -505,8 +501,12 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
memcpy(d->filename[bit_pos], name->name, name->len);
de->ino = cpu_to_le32(ino);
set_de_type(de, mode);
- for (i = 0; i < slots; i++)
+ for (i = 0; i < slots; i++) {
test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+ /* avoid wrong garbage data for readdir */
+ if (i)
+ (de + i)->name_len = 0;
+ }
}
/*
@@ -526,11 +526,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dentry_ptr d;
struct page *page = NULL;
- struct f2fs_filename fname;
+ struct fscrypt_name fname;
struct qstr new_name;
int slots, err;
- err = f2fs_fname_setup_filename(dir, name, 0, &fname);
+ err = fscrypt_setup_filename(dir, name, 0, &fname);
if (err)
return err;
@@ -592,7 +592,7 @@ start:
++level;
goto start;
add_dentry:
- f2fs_wait_on_page_writeback(dentry_page, DATA);
+ f2fs_wait_on_page_writeback(dentry_page, DATA, true);
if (inode) {
down_write(&F2FS_I(inode)->i_sem);
@@ -629,7 +629,8 @@ fail:
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
out:
- f2fs_fname_free_filename(&fname);
+ fscrypt_free_filename(&fname);
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
return err;
}
@@ -651,6 +652,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
fail:
up_write(&F2FS_I(inode)->i_sem);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return err;
}
@@ -695,11 +697,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
int i;
+ f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+
if (f2fs_has_inline_dentry(dir))
return f2fs_delete_inline_entry(dentry, page, dir, inode);
lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
dentry_blk = page_address(page);
bit_pos = dentry - dentry_blk->dentry;
@@ -767,12 +771,12 @@ bool f2fs_empty_dir(struct inode *dir)
}
bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
- unsigned int start_pos, struct f2fs_str *fstr)
+ unsigned int start_pos, struct fscrypt_str *fstr)
{
unsigned char d_type = DT_UNKNOWN;
unsigned int bit_pos;
struct f2fs_dir_entry *de = NULL;
- struct f2fs_str de_name = FSTR_INIT(NULL, 0);
+ struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
bit_pos = ((unsigned long)ctx->pos % d->max);
@@ -782,6 +786,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
break;
de = &d->dentry[bit_pos];
+ if (de->name_len == 0) {
+ bit_pos++;
+ ctx->pos = start_pos + bit_pos;
+ continue;
+ }
+
if (de->file_type < F2FS_FT_MAX)
d_type = f2fs_filetype_table[de->file_type];
else
@@ -800,8 +810,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
memcpy(de_name.name, d->filename[bit_pos], de_name.len);
- ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
- &de_name, fstr);
+ ret = fscrypt_fname_disk_to_usr(d->inode,
+ (u32)de->hash_code, 0,
+ &de_name, fstr);
kfree(de_name.name);
if (ret < 0)
return true;
@@ -829,16 +840,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct file_ra_state *ra = &file->f_ra;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
struct f2fs_dentry_ptr d;
- struct f2fs_str fstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
int err = 0;
if (f2fs_encrypted_inode(inode)) {
- err = f2fs_get_encryption_info(inode);
- if (err)
+ err = fscrypt_get_encryption_info(inode);
+ if (err && err != -ENOKEY)
return err;
- err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN,
- &fstr);
+ err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
if (err < 0)
return err;
}
@@ -855,36 +865,46 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
for (; n < npages; n++) {
dentry_page = get_lock_data_page(inode, n, false);
- if (IS_ERR(dentry_page))
- continue;
+ if (IS_ERR(dentry_page)) {
+ err = PTR_ERR(dentry_page);
+ if (err == -ENOENT)
+ continue;
+ else
+ goto out;
+ }
dentry_blk = kmap(dentry_page);
make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
- if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr))
- goto stop;
+ if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) {
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ break;
+ }
ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
- dentry_page = NULL;
- }
-stop:
- if (dentry_page && !IS_ERR(dentry_page)) {
- kunmap(dentry_page);
- f2fs_put_page(dentry_page, 1);
}
out:
- f2fs_fname_crypto_free_buffer(&fstr);
+ fscrypt_fname_free_buffer(&fstr);
return err;
}
+static int f2fs_dir_open(struct inode *inode, struct file *filp)
+{
+ if (f2fs_encrypted_inode(inode))
+ return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
+ return 0;
+}
+
const struct file_operations f2fs_dir_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.iterate = f2fs_readdir,
.fsync = f2fs_sync_file,
+ .open = f2fs_dir_open,
.unlocked_ioctl = f2fs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = f2fs_compat_ioctl,
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 7ddba812e11b..c859bb044728 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -33,10 +33,11 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
en->ei = *ei;
INIT_LIST_HEAD(&en->list);
+ en->et = et;
rb_link_node(&en->rb_node, parent, p);
rb_insert_color(&en->rb_node, &et->root);
- et->count++;
+ atomic_inc(&et->node_cnt);
atomic_inc(&sbi->total_ext_node);
return en;
}
@@ -45,11 +46,29 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
rb_erase(&en->rb_node, &et->root);
- et->count--;
+ atomic_dec(&et->node_cnt);
atomic_dec(&sbi->total_ext_node);
if (et->cached_en == en)
et->cached_en = NULL;
+ kmem_cache_free(extent_node_slab, en);
+}
+
+/*
+ * Flow to release an extent_node:
+ * 1. list_del_init
+ * 2. __detach_extent_node
+ * 3. kmem_cache_free.
+ */
+static void __release_extent_node(struct f2fs_sb_info *sbi,
+ struct extent_tree *et, struct extent_node *en)
+{
+ spin_lock(&sbi->extent_lock);
+ f2fs_bug_on(sbi, list_empty(&en->list));
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
+
+ __detach_extent_node(sbi, et, en);
}
static struct extent_tree *__grab_extent_tree(struct inode *inode)
@@ -68,11 +87,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
et->root = RB_ROOT;
et->cached_en = NULL;
rwlock_init(&et->lock);
- atomic_set(&et->refcount, 0);
- et->count = 0;
- sbi->total_ext_tree++;
+ INIT_LIST_HEAD(&et->list);
+ atomic_set(&et->node_cnt, 0);
+ atomic_inc(&sbi->total_ext_tree);
+ } else {
+ atomic_dec(&sbi->total_zombie_tree);
+ list_del_init(&et->list);
}
- atomic_inc(&et->refcount);
up_write(&sbi->extent_tree_lock);
/* never died until evict_inode */
@@ -127,32 +148,21 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
}
static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, bool free_all)
+ struct extent_tree *et)
{
struct rb_node *node, *next;
struct extent_node *en;
- unsigned int count = et->count;
+ unsigned int count = atomic_read(&et->node_cnt);
node = rb_first(&et->root);
while (node) {
next = rb_next(node);
en = rb_entry(node, struct extent_node, rb_node);
-
- if (free_all) {
- spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list))
- list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
- }
-
- if (free_all || list_empty(&en->list)) {
- __detach_extent_node(sbi, et, en);
- kmem_cache_free(extent_node_slab, en);
- }
+ __release_extent_node(sbi, et, en);
node = next;
}
- return count - et->count;
+ return count - atomic_read(&et->node_cnt);
}
static void __drop_largest_extent(struct inode *inode,
@@ -164,34 +174,33 @@ static void __drop_largest_extent(struct inode *inode,
largest->len = 0;
}
-void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
-{
- if (!f2fs_may_extent_tree(inode))
- return;
-
- __drop_largest_extent(inode, fofs, 1);
-}
-
-void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+/* return true, if inode page is changed */
+bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et;
struct extent_node *en;
struct extent_info ei;
- if (!f2fs_may_extent_tree(inode))
- return;
+ if (!f2fs_may_extent_tree(inode)) {
+ /* drop largest extent */
+ if (i_ext && i_ext->len) {
+ i_ext->len = 0;
+ return true;
+ }
+ return false;
+ }
et = __grab_extent_tree(inode);
- if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
- return;
+ if (!i_ext || !i_ext->len)
+ return false;
set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
write_lock(&et->lock);
- if (et->count)
+ if (atomic_read(&et->node_cnt))
goto out;
en = __init_extent_tree(sbi, et, &ei);
@@ -202,6 +211,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
}
out:
write_unlock(&et->lock);
+ return false;
}
static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
@@ -230,9 +240,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
if (en) {
*ei = en->ei;
spin_lock(&sbi->extent_lock);
- if (!list_empty(&en->list))
+ if (!list_empty(&en->list)) {
list_move_tail(&en->list, &sbi->extent_list);
- et->cached_en = en;
+ et->cached_en = en;
+ }
spin_unlock(&sbi->extent_lock);
ret = true;
}
@@ -327,7 +338,6 @@ lookup_neighbors:
static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_info *ei,
- struct extent_node **den,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
@@ -340,20 +350,25 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
}
if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
- if (en) {
- __detach_extent_node(sbi, et, prev_ex);
- *den = prev_ex;
- }
+ if (en)
+ __release_extent_node(sbi, et, prev_ex);
next_ex->ei.fofs = ei->fofs;
next_ex->ei.blk = ei->blk;
next_ex->ei.len += ei->len;
en = next_ex;
}
- if (en) {
- __try_update_largest_extent(et, en);
+ if (!en)
+ return NULL;
+
+ __try_update_largest_extent(et, en);
+
+ spin_lock(&sbi->extent_lock);
+ if (!list_empty(&en->list)) {
+ list_move_tail(&en->list, &sbi->extent_list);
et->cached_en = en;
}
+ spin_unlock(&sbi->extent_lock);
return en;
}
@@ -389,7 +404,12 @@ do_insert:
return NULL;
__try_update_largest_extent(et, en);
+
+ /* update in global extent list */
+ spin_lock(&sbi->extent_lock);
+ list_add_tail(&en->list, &sbi->extent_list);
et->cached_en = en;
+ spin_unlock(&sbi->extent_lock);
return en;
}
@@ -477,7 +497,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (parts)
__try_update_largest_extent(et, en);
else
- __detach_extent_node(sbi, et, en);
+ __release_extent_node(sbi, et, en);
/*
* if original extent is split into zero or two parts, extent
@@ -488,31 +508,15 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
insert_p = NULL;
insert_parent = NULL;
}
-
- /* update in global extent list */
- spin_lock(&sbi->extent_lock);
- if (!parts && !list_empty(&en->list))
- list_del(&en->list);
- if (en1)
- list_add_tail(&en1->list, &sbi->extent_list);
- spin_unlock(&sbi->extent_lock);
-
- /* release extent node */
- if (!parts)
- kmem_cache_free(extent_node_slab, en);
-
en = next_en;
}
/* 3. update extent in extent cache */
if (blkaddr) {
- struct extent_node *den = NULL;
set_extent_info(&ei, fofs, blkaddr, len);
- en1 = __try_merge_extent_node(sbi, et, &ei, &den,
- prev_en, next_en);
- if (!en1)
- en1 = __insert_extent_tree(sbi, et, &ei,
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent);
/* give up extent_cache, if split and small updates happen */
@@ -522,24 +526,10 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
et->largest.len = 0;
set_inode_flag(F2FS_I(inode), FI_NO_EXTENT);
}
-
- spin_lock(&sbi->extent_lock);
- if (en1) {
- if (list_empty(&en1->list))
- list_add_tail(&en1->list, &sbi->extent_list);
- else
- list_move_tail(&en1->list, &sbi->extent_list);
- }
- if (den && !list_empty(&den->list))
- list_del(&den->list);
- spin_unlock(&sbi->extent_lock);
-
- if (den)
- kmem_cache_free(extent_node_slab, den);
}
if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
- __free_extent_tree(sbi, et, true);
+ __free_extent_tree(sbi, et);
write_unlock(&et->lock);
@@ -548,46 +538,42 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
{
- struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
- struct extent_node *en, *tmp;
- unsigned long ino = F2FS_ROOT_INO(sbi);
- struct radix_tree_root *root = &sbi->extent_tree_root;
- unsigned int found;
+ struct extent_tree *et, *next;
+ struct extent_node *en;
unsigned int node_cnt = 0, tree_cnt = 0;
int remained;
if (!test_opt(sbi, EXTENT_CACHE))
return 0;
+ if (!atomic_read(&sbi->total_zombie_tree))
+ goto free_node;
+
if (!down_write_trylock(&sbi->extent_tree_lock))
goto out;
/* 1. remove unreferenced extent tree */
- while ((found = radix_tree_gang_lookup(root,
- (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
- unsigned i;
-
- ino = treevec[found - 1]->ino + 1;
- for (i = 0; i < found; i++) {
- struct extent_tree *et = treevec[i];
-
- if (!atomic_read(&et->refcount)) {
- write_lock(&et->lock);
- node_cnt += __free_extent_tree(sbi, et, true);
- write_unlock(&et->lock);
-
- radix_tree_delete(root, et->ino);
- kmem_cache_free(extent_tree_slab, et);
- sbi->total_ext_tree--;
- tree_cnt++;
-
- if (node_cnt + tree_cnt >= nr_shrink)
- goto unlock_out;
- }
+ list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+ if (atomic_read(&et->node_cnt)) {
+ write_lock(&et->lock);
+ node_cnt += __free_extent_tree(sbi, et);
+ write_unlock(&et->lock);
}
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
+ list_del_init(&et->list);
+ radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ kmem_cache_free(extent_tree_slab, et);
+ atomic_dec(&sbi->total_ext_tree);
+ atomic_dec(&sbi->total_zombie_tree);
+ tree_cnt++;
+
+ if (node_cnt + tree_cnt >= nr_shrink)
+ goto unlock_out;
+ cond_resched();
}
up_write(&sbi->extent_tree_lock);
+free_node:
/* 2. remove LRU extent entries */
if (!down_write_trylock(&sbi->extent_tree_lock))
goto out;
@@ -595,34 +581,29 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
remained = nr_shrink - (node_cnt + tree_cnt);
spin_lock(&sbi->extent_lock);
- list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
- if (!remained--)
+ for (; remained > 0; remained--) {
+ if (list_empty(&sbi->extent_list))
break;
- list_del_init(&en->list);
- }
- spin_unlock(&sbi->extent_lock);
-
- /*
- * reset ino for searching victims from beginning of global extent tree.
- */
- ino = F2FS_ROOT_INO(sbi);
-
- while ((found = radix_tree_gang_lookup(root,
- (void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
- unsigned i;
+ en = list_first_entry(&sbi->extent_list,
+ struct extent_node, list);
+ et = en->et;
+ if (!write_trylock(&et->lock)) {
+ /* refresh this extent node's position in extent list */
+ list_move_tail(&en->list, &sbi->extent_list);
+ continue;
+ }
- ino = treevec[found - 1]->ino + 1;
- for (i = 0; i < found; i++) {
- struct extent_tree *et = treevec[i];
+ list_del_init(&en->list);
+ spin_unlock(&sbi->extent_lock);
- write_lock(&et->lock);
- node_cnt += __free_extent_tree(sbi, et, false);
- write_unlock(&et->lock);
+ __detach_extent_node(sbi, et, en);
- if (node_cnt + tree_cnt >= nr_shrink)
- goto unlock_out;
- }
+ write_unlock(&et->lock);
+ node_cnt++;
+ spin_lock(&sbi->extent_lock);
}
+ spin_unlock(&sbi->extent_lock);
+
unlock_out:
up_write(&sbi->extent_tree_lock);
out:
@@ -637,11 +618,11 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
struct extent_tree *et = F2FS_I(inode)->extent_tree;
unsigned int node_cnt = 0;
- if (!et)
+ if (!et || !atomic_read(&et->node_cnt))
return 0;
write_lock(&et->lock);
- node_cnt = __free_extent_tree(sbi, et, true);
+ node_cnt = __free_extent_tree(sbi, et);
write_unlock(&et->lock);
return node_cnt;
@@ -656,8 +637,12 @@ void f2fs_destroy_extent_tree(struct inode *inode)
if (!et)
return;
- if (inode->i_nlink && !is_bad_inode(inode) && et->count) {
- atomic_dec(&et->refcount);
+ if (inode->i_nlink && !is_bad_inode(inode) &&
+ atomic_read(&et->node_cnt)) {
+ down_write(&sbi->extent_tree_lock);
+ list_add_tail(&et->list, &sbi->zombie_list);
+ atomic_inc(&sbi->total_zombie_tree);
+ up_write(&sbi->extent_tree_lock);
return;
}
@@ -666,11 +651,10 @@ void f2fs_destroy_extent_tree(struct inode *inode)
/* delete extent tree entry in radix tree */
down_write(&sbi->extent_tree_lock);
- atomic_dec(&et->refcount);
- f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+ f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
kmem_cache_free(extent_tree_slab, et);
- sbi->total_ext_tree--;
+ atomic_dec(&sbi->total_ext_tree);
up_write(&sbi->extent_tree_lock);
F2FS_I(inode)->extent_tree = NULL;
@@ -689,19 +673,21 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
void f2fs_update_extent_cache(struct dnode_of_data *dn)
{
- struct f2fs_inode_info *fi = F2FS_I(dn->inode);
pgoff_t fofs;
+ block_t blkaddr;
if (!f2fs_may_extent_tree(dn->inode))
return;
- f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
+ if (dn->data_blkaddr == NEW_ADDR)
+ blkaddr = NULL_ADDR;
+ else
+ blkaddr = dn->data_blkaddr;
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
- dn->ofs_in_node;
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ dn->ofs_in_node;
- if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1))
+ if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1))
sync_inode_page(dn);
}
@@ -722,7 +708,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi)
init_rwsem(&sbi->extent_tree_lock);
INIT_LIST_HEAD(&sbi->extent_list);
spin_lock_init(&sbi->extent_lock);
- sbi->total_ext_tree = 0;
+ atomic_set(&sbi->total_ext_tree, 0);
+ INIT_LIST_HEAD(&sbi->zombie_list);
+ atomic_set(&sbi->total_zombie_tree, 0);
atomic_set(&sbi->total_ext_node, 0);
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9db5500d63d9..7a4558d17f36 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,12 @@
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/fscrypto.h>
+#include <crypto/hash.h>
#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
-#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
#else
#define f2fs_bug_on(sbi, condition) \
do { \
@@ -33,7 +35,6 @@
set_sbi_flag(sbi, SBI_NEED_FSCK); \
} \
} while (0)
-#define f2fs_down_write(x, y) down_write(x)
#endif
/*
@@ -54,6 +55,7 @@
#define F2FS_MOUNT_FASTBOOT 0x00001000
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
+#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -82,27 +84,6 @@ struct f2fs_mount_info {
#define F2FS_CLEAR_FEATURE(sb, mask) \
F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)
-#define CRCPOLY_LE 0xedb88320
-
-static inline __u32 f2fs_crc32(void *buf, size_t len)
-{
- unsigned char *p = (unsigned char *)buf;
- __u32 crc = F2FS_SUPER_MAGIC;
- int i;
-
- while (len--) {
- crc ^= *p++;
- for (i = 0; i < 8; i++)
- crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0);
- }
- return crc;
-}
-
-static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size)
-{
- return f2fs_crc32(buf, buf_size) == blk_crc;
-}
-
/*
* For checkpoint manager
*/
@@ -125,6 +106,7 @@ enum {
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
#define DEF_CP_INTERVAL 60 /* 60 secs */
+#define DEF_IDLE_INTERVAL 120 /* 2 mins */
struct cp_control {
int reason;
@@ -158,13 +140,7 @@ struct ino_entry {
nid_t ino; /* inode number */
};
-/*
- * for the list of directory inodes or gc inodes.
- * NOTE: there are two slab users for this structure, if we add/modify/delete
- * fields in structure for one of slab users, it may affect fields or size of
- * other one, in this condition, it's better to split both of slab and related
- * data structure.
- */
+/* for the list of inodes to be GCed */
struct inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
@@ -186,37 +162,37 @@ struct fsync_inode_entry {
block_t last_inode; /* block address locating the last inode */
};
-#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
-#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits))
+#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
+#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
-#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne)
-#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid)
-#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
-#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
-#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
-#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
+#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
-static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
+static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i)
{
- int before = nats_in_cursum(rs);
- rs->n_nats = cpu_to_le16(before + i);
+ int before = nats_in_cursum(journal);
+ journal->n_nats = cpu_to_le16(before + i);
return before;
}
-static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
+static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i)
{
- int before = sits_in_cursum(rs);
- rs->n_sits = cpu_to_le16(before + i);
+ int before = sits_in_cursum(journal);
+ journal->n_sits = cpu_to_le16(before + i);
return before;
}
-static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
- int type)
+static inline bool __has_cursum_space(struct f2fs_journal *journal,
+ int size, int type)
{
if (type == NAT_JOURNAL)
- return size <= MAX_NAT_JENTRIES(sum);
- return size <= MAX_SIT_JENTRIES(sum);
+ return size <= MAX_NAT_JENTRIES(journal);
+ return size <= MAX_SIT_JENTRIES(journal);
}
/*
@@ -234,13 +210,11 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
+#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8)
-#define F2FS_IOC_SET_ENCRYPTION_POLICY \
- _IOR('f', 19, struct f2fs_encryption_policy)
-#define F2FS_IOC_GET_ENCRYPTION_PWSALT \
- _IOW('f', 20, __u8[16])
-#define F2FS_IOC_GET_ENCRYPTION_POLICY \
- _IOW('f', 21, struct f2fs_encryption_policy)
+#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT
/*
* should be same as XFS_IOC_GOINGDOWN.
@@ -256,33 +230,20 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
/*
* ioctl commands in 32 bit emulation
*/
-#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
-#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION
#endif
+struct f2fs_defragment {
+ u64 start;
+ u64 len;
+};
+
/*
* For INODE and NODE manager
*/
/* for directory operations */
-struct f2fs_str {
- unsigned char *name;
- u32 len;
-};
-
-struct f2fs_filename {
- const struct qstr *usr_fname;
- struct f2fs_str disk_name;
- f2fs_hash_t hash;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- struct f2fs_str crypto_buf;
-#endif
-};
-
-#define FSTR_INIT(n, l) { .name = n, .len = l }
-#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len)
-#define fname_name(p) ((p)->disk_name.name)
-#define fname_len(p) ((p)->disk_name.len)
-
struct f2fs_dentry_ptr {
struct inode *inode;
const void *bitmap;
@@ -350,6 +311,7 @@ struct extent_node {
struct rb_node rb_node; /* rb node located in rb-tree */
struct list_head list; /* node in global extent list of sbi */
struct extent_info ei; /* extent info */
+ struct extent_tree *et; /* extent tree pointer */
};
struct extent_tree {
@@ -357,9 +319,9 @@ struct extent_tree {
struct rb_root root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
struct extent_info largest; /* largest extent info */
+ struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
- atomic_t refcount; /* reference count of rb-tree */
- unsigned int count; /* # of extent node in rb-tree*/
+ atomic_t node_cnt; /* # of extent nodes in rb-tree */
};
/*
@@ -378,6 +340,7 @@ struct f2fs_map_blocks {
block_t m_lblk;
unsigned int m_len;
unsigned int m_flags;
+ pgoff_t *m_next_pgofs; /* points to next possible non-hole pgofs */
};
/* for flag in get_data_block */
@@ -385,6 +348,8 @@ struct f2fs_map_blocks {
#define F2FS_GET_BLOCK_DIO 1
#define F2FS_GET_BLOCK_FIEMAP 2
#define F2FS_GET_BLOCK_BMAP 3
+#define F2FS_GET_BLOCK_PRE_DIO 4
+#define F2FS_GET_BLOCK_PRE_AIO 5
/*
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
@@ -406,15 +371,6 @@ struct f2fs_map_blocks {
#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT)
#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
-/* Encryption algorithms */
-#define F2FS_ENCRYPTION_MODE_INVALID 0
-#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1
-#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2
-#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3
-#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4
-
-#include "f2fs_crypto.h"
-
#define DEF_DIR_LEVEL 0
struct f2fs_inode_info {
@@ -434,17 +390,11 @@ struct f2fs_inode_info {
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
- struct inode_entry *dirty_dir; /* the pointer of dirty dir */
+ struct list_head dirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
struct mutex inmem_lock; /* lock for inmemory pages */
-
struct extent_tree *extent_tree; /* cached extent_tree entry */
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- /* Encryption params */
- struct f2fs_crypt_info *i_crypt_info;
-#endif
};
static inline void get_extent_info(struct extent_info *ext,
@@ -511,6 +461,7 @@ struct f2fs_nm_info {
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
+ unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -544,6 +495,9 @@ struct dnode_of_data {
nid_t nid; /* node id of the direct node block */
unsigned int ofs_in_node; /* data offset in the node page */
bool inode_page_locked; /* inode page is locked or not */
+ bool node_changed; /* is node block changed */
+ char cur_level; /* level of hole node page */
+ char max_level; /* level of current page located */
block_t data_blkaddr; /* block address of the node block */
};
@@ -647,6 +601,7 @@ struct f2fs_sm_info {
enum count_type {
F2FS_WRITEBACK,
F2FS_DIRTY_DENTS,
+ F2FS_DIRTY_DATA,
F2FS_DIRTY_NODES,
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
@@ -673,6 +628,7 @@ enum page_type {
META_FLUSH,
INMEM, /* the below types are used by tracepoints only. */
INMEM_DROP,
+ INMEM_REVOKE,
IPU,
OPU,
};
@@ -681,7 +637,8 @@ struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
- block_t blk_addr; /* block address to be written */
+ block_t new_blkaddr; /* new block address to be written */
+ block_t old_blkaddr; /* old block address before CoW */
struct page *page; /* page to be written */
struct page *encrypted_page; /* encrypted page */
};
@@ -695,6 +652,12 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+enum inode_type {
+ DIR_INODE, /* for dirty dir inode */
+ FILE_INODE, /* for dirty regular/symlink inode */
+ NR_INODE_TYPE,
+};
+
/* for inner inode cache management */
struct inode_management {
struct radix_tree_root ino_root; /* ino entry array */
@@ -711,11 +674,17 @@ enum {
SBI_POR_DOING, /* recovery is doing or not */
};
+enum {
+ CP_TIME,
+ REQ_TIME,
+ MAX_TIME,
+};
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
- struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
+ int valid_super_block; /* valid super block no */
int s_flag; /* flags for sbi */
/* for node-related operations */
@@ -737,23 +706,26 @@ struct f2fs_sb_info {
struct rw_semaphore node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
wait_queue_head_t cp_wait;
- long cp_expires, cp_interval; /* next expected periodic cp */
+ unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
+ long interval_time[MAX_TIME]; /* to store thresholds */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
/* for orphan inode, use 0'th array */
unsigned int max_orphans; /* max orphan inodes */
- /* for directory inode management */
- struct list_head dir_inode_list; /* dir inode list */
- spinlock_t dir_inode_lock; /* for dir inode list lock */
+ /* for inode management */
+ struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */
+ spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */
/* for extent tree cache */
struct radix_tree_root extent_tree_root;/* cache extent cache entries */
struct rw_semaphore extent_tree_lock; /* locking extent radix tree */
struct list_head extent_list; /* lru list for shrinker */
spinlock_t extent_lock; /* locking extent lru list */
- int total_ext_tree; /* extent tree count */
+ atomic_t total_ext_tree; /* extent tree count */
+ struct list_head zombie_list; /* extent zombie tree list */
+ atomic_t total_zombie_tree; /* extent zombie tree count */
atomic_t total_ext_node; /* extent info count */
/* basic filesystem units */
@@ -771,6 +743,7 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
unsigned int total_valid_inode_count; /* valid inode count */
+ loff_t max_file_blocks; /* max block index of file */
int active_logs; /* # of active logs */
int dir_level; /* directory level */
@@ -809,7 +782,7 @@ struct f2fs_sb_info {
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
int bg_gc; /* background gc calls */
- unsigned int n_dirty_dirs; /* # of dir inodes */
+ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
#endif
unsigned int last_victim[2]; /* last victim segment # */
spinlock_t stat_lock; /* lock for stat operations */
@@ -822,11 +795,73 @@ struct f2fs_sb_info {
struct list_head s_list;
struct mutex umount_mutex;
unsigned int shrinker_run_no;
+
+ /* For write statistics */
+ u64 sectors_written_start;
+ u64 kbytes_written;
+
+ /* Reference to checksum algorithm driver via cryptoapi */
+ struct crypto_shash *s_chksum_driver;
};
+/* For write statistics. Assumes the sector size is 512 bytes, so the
+ * returned value is in kbytes. s is a pointer to struct f2fs_sb_info.
+ */
+#define BD_PART_WRITTEN(s) \
+(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
+ s->sectors_written_start) >> 1)
+
+static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
+{
+ sbi->last_time[type] = jiffies;
+}
+
+static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
+{
+ struct timespec ts = {sbi->interval_time[type], 0};
+ unsigned long interval = timespec_to_jiffies(&ts);
+
+ return time_after(jiffies, sbi->last_time[type] + interval);
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct request_list *rl = &q->root_rl;
+
+ if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
+ return 0;
+
+ return f2fs_time_over(sbi, REQ_TIME);
+}
+
/*
* Inline functions
*/
+static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
+ unsigned int length)
+{
+ SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver);
+ u32 *ctx = (u32 *)shash_desc_ctx(shash);
+ int err;
+
+ shash->tfm = sbi->s_chksum_driver;
+ shash->flags = 0;
+ *ctx = F2FS_SUPER_MAGIC;
+
+ err = crypto_shash_update(shash, address, length);
+ BUG_ON(err);
+
+ return *ctx;
+}
+
+static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
+ void *buf, size_t buf_size)
+{
+ return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
+}
+
static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
{
return container_of(inode, struct f2fs_inode_info, vfs_inode);
@@ -959,7 +994,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
+ down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
@@ -1059,8 +1094,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_inc_dirty_pages(struct inode *inode)
{
atomic_inc(&F2FS_I(inode)->dirty_pages);
- if (S_ISDIR(inode->i_mode))
- inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1075,9 +1110,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
return;
atomic_dec(&F2FS_I(inode)->dirty_pages);
-
- if (S_ISDIR(inode->i_mode))
- dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+ dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+ F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
}
static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -1092,8 +1126,7 @@ static inline int get_dirty_pages(struct inode *inode)
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
- unsigned int pages_per_sec = sbi->segs_per_sec *
- (1 << sbi->log_blocks_per_seg);
+ unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
return ((get_pages(sbi, block_type) + pages_per_sec - 1)
>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
}
@@ -1261,7 +1294,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
unlock_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
static inline void f2fs_put_dnode(struct dnode_of_data *dn)
@@ -1416,6 +1449,8 @@ enum {
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
+ FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1478,9 +1513,9 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
}
-static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
+static inline unsigned int addrs_per_inode(struct inode *inode)
{
- if (f2fs_has_inline_xattr(&fi->vfs_inode))
+ if (f2fs_has_inline_xattr(inode))
return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
return DEF_ADDRS_PER_INODE;
}
@@ -1602,13 +1637,11 @@ static inline bool is_dot_dotdot(const struct qstr *str)
static inline bool f2fs_may_extent_tree(struct inode *inode)
{
- mode_t mode = inode->i_mode;
-
if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
return false;
- return S_ISREG(mode);
+ return S_ISREG(inode->i_mode);
}
static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
@@ -1636,10 +1669,10 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
/* get offset of first page in next direct node */
-#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \
- ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \
- (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \
- ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi))
+#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \
+ ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \
+ (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \
+ ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode))
/*
* file.c
@@ -1661,8 +1694,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
void f2fs_set_inode_flags(struct inode *);
struct inode *f2fs_iget(struct super_block *, unsigned long);
int try_to_free_nats(struct f2fs_sb_info *, int);
-void update_inode(struct inode *, struct page *);
-void update_inode_page(struct inode *);
+int update_inode(struct inode *, struct page *);
+int update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
void handle_failed_inode(struct inode *);
@@ -1678,10 +1711,10 @@ struct dentry *f2fs_get_parent(struct dentry *child);
extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
void set_de_type(struct f2fs_dir_entry *, umode_t);
-struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *,
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
- unsigned int, struct f2fs_str *);
+ unsigned int, struct fscrypt_str *);
void do_make_empty_dir(struct inode *, struct inode *,
struct f2fs_dentry_ptr *);
struct page *init_inode_metadata(struct inode *, struct inode *,
@@ -1718,6 +1751,7 @@ int f2fs_commit_super(struct f2fs_sb_info *, bool);
int f2fs_sync_fs(struct super_block *, int);
extern __printf(3, 4)
void f2fs_msg(struct super_block *, const char *, const char *, ...);
+int sanity_check_ckpt(struct f2fs_sb_info *sbi);
/*
* hash.c
@@ -1735,6 +1769,7 @@ int need_dentry_mark(struct f2fs_sb_info *, nid_t);
bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
+pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int truncate_xattr_node(struct inode *, struct page *);
@@ -1766,8 +1801,9 @@ void destroy_node_manager_caches(void);
* segment.c
*/
void register_inmem_page(struct inode *, struct page *);
-int commit_inmem_pages(struct inode *, bool);
-void f2fs_balance_fs(struct f2fs_sb_info *);
+void drop_inmem_pages(struct inode *);
+int commit_inmem_pages(struct inode *);
+void f2fs_balance_fs(struct f2fs_sb_info *, bool);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
int f2fs_issue_flush(struct f2fs_sb_info *);
int create_flush_cmd_control(struct f2fs_sb_info *);
@@ -1787,16 +1823,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(unsigned int, struct f2fs_io_info *);
void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
void rewrite_data_page(struct f2fs_io_info *);
+void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *,
+ block_t, block_t, bool, bool);
void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
- block_t, block_t, unsigned char, bool);
+ block_t, block_t, unsigned char, bool, bool);
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
-void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
-int lookup_journal_in_cursum(struct f2fs_summary_block *,
- int, unsigned int, int);
+int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int);
void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
int build_segment_manager(struct f2fs_sb_info *);
void destroy_segment_manager(struct f2fs_sb_info *);
@@ -1813,9 +1850,9 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool);
void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void release_dirty_inode(struct f2fs_sb_info *);
+void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void release_ino_entry(struct f2fs_sb_info *);
bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
@@ -1825,9 +1862,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void update_dirty_page(struct inode *, struct page *);
void add_dirty_dir_inode(struct inode *);
-void remove_dirty_dir_inode(struct inode *);
-void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
+void remove_dirty_inode(struct inode *);
+int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type);
+int write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
void init_ino_entry_info(struct f2fs_sb_info *);
int __init create_checkpoint_caches(void);
void destroy_checkpoint_caches(void);
@@ -1836,17 +1873,23 @@ void destroy_checkpoint_caches(void);
* data.c
*/
void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
+void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *,
+ struct page *, nid_t, enum page_type, int);
+void f2fs_flush_merged_bios(struct f2fs_sb_info *);
int f2fs_submit_page_bio(struct f2fs_io_info *);
void f2fs_submit_page_mbio(struct f2fs_io_info *);
void set_data_blkaddr(struct dnode_of_data *);
+void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
+ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
struct page *find_data_page(struct inode *, pgoff_t);
struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int do_write_data_page(struct f2fs_io_info *);
+int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
int f2fs_release_page(struct page *, gfp_t);
@@ -1856,7 +1899,7 @@ int f2fs_release_page(struct page *, gfp_t);
*/
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
-block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
+block_t start_bidx_of_node(unsigned int, struct inode *);
int f2fs_gc(struct f2fs_sb_info *, bool);
void build_gc_manager(struct f2fs_sb_info *);
@@ -1877,8 +1920,9 @@ struct f2fs_stat_info {
int main_area_segs, main_area_sections, main_area_zones;
unsigned long long hit_largest, hit_cached, hit_rbtree;
unsigned long long hit_total, total_ext;
- int ext_tree, ext_node;
- int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+ int ext_tree, zombie_tree, ext_node;
+ int ndirty_node, ndirty_meta;
+ int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files;
int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
int bg_gc, inmem_pages, wb_pages;
@@ -1888,7 +1932,7 @@ struct f2fs_stat_info {
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
int dirty_count, node_pages, meta_pages;
- int prefree_count, call_count, cp_count;
+ int prefree_count, call_count, cp_count, bg_cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
int bg_node_segs, bg_data_segs;
int tot_blks, data_blks, node_blks;
@@ -1909,10 +1953,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
}
#define stat_inc_cp_count(si) ((si)->cp_count++)
+#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
#define stat_inc_call_count(si) ((si)->call_count++)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
-#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
-#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
+#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
+#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
@@ -1987,14 +2032,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
int f2fs_build_stats(struct f2fs_sb_info *);
void f2fs_destroy_stats(struct f2fs_sb_info *);
-void __init f2fs_create_root_stats(void);
+int __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
#define stat_inc_cp_count(si)
+#define stat_inc_bg_cp_count(si)
#define stat_inc_call_count(si)
#define stat_inc_bggc_count(si)
-#define stat_inc_dirty_dir(sbi)
-#define stat_dec_dirty_dir(sbi)
+#define stat_inc_dirty_inode(sbi, type)
+#define stat_dec_dirty_inode(sbi, type)
#define stat_inc_total_hit(sb)
#define stat_inc_rbtree_node_hit(sb)
#define stat_inc_largest_node_hit(sbi)
@@ -2015,7 +2061,7 @@ void f2fs_destroy_root_stats(void);
static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline void __init f2fs_create_root_stats(void) { }
+static inline int __init f2fs_create_root_stats(void) { return 0; }
static inline void f2fs_destroy_root_stats(void) { }
#endif
@@ -2044,7 +2090,7 @@ int f2fs_convert_inline_inode(struct inode *);
int f2fs_write_inline_data(struct inode *, struct page *);
bool recover_inline_data(struct inode *, struct page *);
struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
- struct f2fs_filename *, struct page **);
+ struct fscrypt_name *, struct page **);
struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
@@ -2053,7 +2099,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
struct inode *, struct inode *);
bool f2fs_empty_inline_dir(struct inode *);
int f2fs_read_inline_dir(struct file *, struct dir_context *,
- struct f2fs_str *);
+ struct fscrypt_str *);
int f2fs_inline_data_fiemap(struct inode *,
struct fiemap_extent_info *, __u64, __u64);
@@ -2069,8 +2115,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *);
* extent_cache.c
*/
unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
-void f2fs_drop_largest_extent(struct inode *, pgoff_t);
-void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
unsigned int f2fs_destroy_extent_node(struct inode *);
void f2fs_destroy_extent_tree(struct inode *);
bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
@@ -2084,13 +2129,9 @@ void destroy_extent_cache(void);
/*
* crypto support
*/
-static inline int f2fs_encrypted_inode(struct inode *inode)
+static inline bool f2fs_encrypted_inode(struct inode *inode)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
return file_is_encrypt(inode);
-#else
- return 0;
-#endif
}
static inline void f2fs_set_encrypted_inode(struct inode *inode)
@@ -2102,26 +2143,18 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
static inline bool f2fs_bio_encrypted(struct bio *bio)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- return unlikely(bio->bi_private != NULL);
-#else
- return false;
-#endif
+ return bio->bi_private != NULL;
}
static inline int f2fs_sb_has_crypto(struct super_block *sb)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
-#else
- return 0;
-#endif
}
static inline bool f2fs_may_encrypt(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
- mode_t mode = inode->i_mode;
+ umode_t mode = inode->i_mode;
return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
#else
@@ -2129,86 +2162,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#endif
}
-/* crypto_policy.c */
-int f2fs_is_child_context_consistent_with_parent(struct inode *,
- struct inode *);
-int f2fs_inherit_context(struct inode *, struct inode *, struct page *);
-int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *);
-int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *);
-
-/* crypt.c */
-extern struct kmem_cache *f2fs_crypt_info_cachep;
-bool f2fs_valid_contents_enc_mode(uint32_t);
-uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t);
-struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *);
-void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *);
-struct page *f2fs_encrypt(struct inode *, struct page *);
-int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *);
-int f2fs_decrypt_one(struct inode *, struct page *);
-void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *);
-
-/* crypto_key.c */
-void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *);
-int _f2fs_get_encryption_info(struct inode *inode);
-
-/* crypto_fname.c */
-bool f2fs_valid_filenames_enc_mode(uint32_t);
-u32 f2fs_fname_crypto_round_up(u32, u32);
-int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *);
-int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *,
- const struct f2fs_str *, struct f2fs_str *);
-int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *,
- struct f2fs_str *);
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-void f2fs_restore_and_release_control_page(struct page **);
-void f2fs_restore_control_page(struct page *);
-
-int __init f2fs_init_crypto(void);
-int f2fs_crypto_initialize(void);
-void f2fs_exit_crypto(void);
-
-int f2fs_has_encryption_key(struct inode *);
-
-static inline int f2fs_get_encryption_info(struct inode *inode)
-{
- struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
- if (!ci ||
- (ci->ci_keyring_key &&
- (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
- (1 << KEY_FLAG_REVOKED) |
- (1 << KEY_FLAG_DEAD)))))
- return _f2fs_get_encryption_info(inode);
- return 0;
-}
-
-void f2fs_fname_crypto_free_buffer(struct f2fs_str *);
-int f2fs_fname_setup_filename(struct inode *, const struct qstr *,
- int lookup, struct f2fs_filename *);
-void f2fs_fname_free_filename(struct f2fs_filename *);
-#else
-static inline void f2fs_restore_and_release_control_page(struct page **p) { }
-static inline void f2fs_restore_control_page(struct page *p) { }
-
-static inline int __init f2fs_init_crypto(void) { return 0; }
-static inline void f2fs_exit_crypto(void) { }
-
-static inline int f2fs_has_encryption_key(struct inode *i) { return 0; }
-static inline int f2fs_get_encryption_info(struct inode *i) { return 0; }
-static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { }
-
-static inline int f2fs_fname_setup_filename(struct inode *dir,
- const struct qstr *iname,
- int lookup, struct f2fs_filename *fname)
-{
- memset(fname, 0, sizeof(struct f2fs_filename));
- fname->usr_fname = iname;
- fname->disk_name.name = (unsigned char *)iname->name;
- fname->disk_name.len = iname->len;
- return 0;
-}
-
-static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { }
+#ifndef CONFIG_F2FS_FS_ENCRYPTION
+#define fscrypt_set_d_op(i)
+#define fscrypt_get_ctx fscrypt_notsupp_get_ctx
+#define fscrypt_release_ctx fscrypt_notsupp_release_ctx
+#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page
+#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page
+#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages
+#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
+#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
+#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
+#define fscrypt_process_policy fscrypt_notsupp_process_policy
+#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
+#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
+#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
+#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info
+#define fscrypt_setup_filename fscrypt_notsupp_setup_filename
+#define fscrypt_free_filename fscrypt_notsupp_free_filename
+#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size
+#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer
+#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer
+#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr
+#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk
#endif
#endif
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
deleted file mode 100644
index c2c1c2b63b25..000000000000
--- a/fs/f2fs/f2fs_crypto.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * linux/fs/f2fs/f2fs_crypto.h
- *
- * Copied from linux/fs/ext4/ext4_crypto.h
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption header content for f2fs
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-#ifndef _F2FS_CRYPTO_H
-#define _F2FS_CRYPTO_H
-
-#include <linux/fs.h>
-
-#define F2FS_KEY_DESCRIPTOR_SIZE 8
-
-/* Policy provided via an ioctl on the topmost directory */
-struct f2fs_encryption_policy {
- char version;
- char contents_encryption_mode;
- char filenames_encryption_mode;
- char flags;
- char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
-} __attribute__((__packed__));
-
-#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1
-#define F2FS_KEY_DERIVATION_NONCE_SIZE 16
-
-#define F2FS_POLICY_FLAGS_PAD_4 0x00
-#define F2FS_POLICY_FLAGS_PAD_8 0x01
-#define F2FS_POLICY_FLAGS_PAD_16 0x02
-#define F2FS_POLICY_FLAGS_PAD_32 0x03
-#define F2FS_POLICY_FLAGS_PAD_MASK 0x03
-#define F2FS_POLICY_FLAGS_VALID 0x03
-
-/**
- * Encryption context for inode
- *
- * Protector format:
- * 1 byte: Protector format (1 = this version)
- * 1 byte: File contents encryption mode
- * 1 byte: File names encryption mode
- * 1 byte: Flags
- * 8 bytes: Master Key descriptor
- * 16 bytes: Encryption Key derivation nonce
- */
-struct f2fs_encryption_context {
- char format;
- char contents_encryption_mode;
- char filenames_encryption_mode;
- char flags;
- char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
- char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
-} __attribute__((__packed__));
-
-/* Encryption parameters */
-#define F2FS_XTS_TWEAK_SIZE 16
-#define F2FS_AES_128_ECB_KEY_SIZE 16
-#define F2FS_AES_256_GCM_KEY_SIZE 32
-#define F2FS_AES_256_CBC_KEY_SIZE 32
-#define F2FS_AES_256_CTS_KEY_SIZE 32
-#define F2FS_AES_256_XTS_KEY_SIZE 64
-#define F2FS_MAX_KEY_SIZE 64
-
-#define F2FS_KEY_DESC_PREFIX "f2fs:"
-#define F2FS_KEY_DESC_PREFIX_SIZE 5
-
-struct f2fs_encryption_key {
- __u32 mode;
- char raw[F2FS_MAX_KEY_SIZE];
- __u32 size;
-} __attribute__((__packed__));
-
-struct f2fs_crypt_info {
- char ci_data_mode;
- char ci_filename_mode;
- char ci_flags;
- struct crypto_ablkcipher *ci_ctfm;
- struct key *ci_keyring_key;
- char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
-#define F2FS_WRITE_PATH_FL 0x00000002
-
-struct f2fs_crypto_ctx {
- union {
- struct {
- struct page *bounce_page; /* Ciphertext page */
- struct page *control_page; /* Original page */
- } w;
- struct {
- struct bio *bio;
- struct work_struct work;
- } r;
- struct list_head free_list; /* Free list */
- };
- char flags; /* Flags */
-};
-
-struct f2fs_completion_result {
- struct completion completion;
- int res;
-};
-
-#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
- struct f2fs_completion_result ecr = { \
- COMPLETION_INITIALIZER((ecr).completion), 0 }
-
-static inline int f2fs_encryption_key_size(int mode)
-{
- switch (mode) {
- case F2FS_ENCRYPTION_MODE_AES_256_XTS:
- return F2FS_AES_256_XTS_KEY_SIZE;
- case F2FS_ENCRYPTION_MODE_AES_256_GCM:
- return F2FS_AES_256_GCM_KEY_SIZE;
- case F2FS_ENCRYPTION_MODE_AES_256_CBC:
- return F2FS_AES_256_CBC_KEY_SIZE;
- case F2FS_ENCRYPTION_MODE_AES_256_CTS:
- return F2FS_AES_256_CTS_KEY_SIZE;
- default:
- BUG();
- }
- return 0;
-}
-
-#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4
-#define F2FS_CRYPTO_BLOCK_SIZE 16
-#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32
-
-/**
- * For encrypted symlinks, the ciphertext length is stored at the beginning
- * of the string in little-endian format.
- */
-struct f2fs_encrypted_symlink_data {
- __le16 len;
- char encrypted_path[1];
-} __attribute__((__packed__));
-
-/**
- * This function is used to calculate the disk space required to
- * store a filename of length l in encrypted symlink format.
- */
-static inline u32 encrypted_symlink_data_len(u32 l)
-{
- return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
-}
-#endif /* _F2FS_CRYPTO_H */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index a197215ad52b..90d1157a09f9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -40,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct dnode_of_data dn;
int err;
- f2fs_balance_fs(sbi);
-
sb_start_pagefault(inode->i_sb);
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -57,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
+ f2fs_balance_fs(sbi, dn.node_changed);
+
file_update_time(vma->vm_file);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
@@ -74,11 +74,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
goto mapped;
/* page is wholly or partially inside EOF */
- if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+ if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
i_size_read(inode)) {
unsigned offset;
- offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ offset = i_size_read(inode) & ~PAGE_MASK;
+ zero_user_segment(page, offset, PAGE_SIZE);
}
set_page_dirty(page);
SetPageUptodate(page);
@@ -86,7 +86,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
/* fill the page */
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, false);
/* wait for GCed encrypted page writeback */
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -96,6 +96,7 @@ mapped:
clear_cold_data(page);
out:
sb_end_pagefault(inode->i_sb);
+ f2fs_update_time(sbi, REQ_TIME);
return block_page_mkwrite_return(err);
}
@@ -201,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_f2fs_sync_file_enter(inode);
/* if fdatasync is triggered, let's do in-place-update */
- if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
+ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
set_inode_flag(fi, FI_NEED_IPU);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
clear_inode_flag(fi, FI_NEED_IPU);
@@ -233,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
go_write:
- /* guarantee free sections for fsync */
- f2fs_balance_fs(sbi);
-
/*
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
@@ -261,8 +259,10 @@ sync_nodes:
sync_node_pages(sbi, ino, &wbc);
/* if cp_error was enabled, we should avoid infinite loop */
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
goto out;
+ }
if (need_inode_block_update(sbi, ino)) {
mark_inode_dirty_sync(inode);
@@ -275,12 +275,13 @@ sync_nodes:
goto out;
/* once recovery info is written, don't need to tack this */
- remove_dirty_inode(sbi, ino, APPEND_INO);
+ remove_ino_entry(sbi, ino, APPEND_INO);
clear_inode_flag(fi, FI_APPEND_WRITE);
flush_out:
- remove_dirty_inode(sbi, ino, UPDATE_INO);
+ remove_ino_entry(sbi, ino, UPDATE_INO);
clear_inode_flag(fi, FI_UPDATE_WRITE);
ret = f2fs_issue_flush(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
f2fs_trace_ios(NULL, 1);
@@ -300,7 +301,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping,
pagevec_init(&pvec, 0);
nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
PAGECACHE_TAG_DIRTY, 1);
- pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
+ pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX;
pagevec_release(&pvec);
return pgofs;
}
@@ -332,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
loff_t isize;
int err = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
if (offset >= isize)
@@ -345,11 +346,11 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
goto found;
}
- pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
+ pgofs = (pgoff_t)(offset >> PAGE_SHIFT);
dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
- for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
+ for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
if (err && err != -ENOENT) {
@@ -357,20 +358,19 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
} else if (err == -ENOENT) {
/* direct node does not exists */
if (whence == SEEK_DATA) {
- pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
- F2FS_I(inode));
+ pgofs = get_next_page_offset(&dn, pgofs);
continue;
} else {
goto found;
}
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
/* find data/hole in dnode block */
for (; dn.ofs_in_node < end_offset;
dn.ofs_in_node++, pgofs++,
- data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
+ data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
@@ -387,10 +387,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
found:
if (whence == SEEK_HOLE && data_ofs > isize)
data_ofs = isize;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return vfs_setpos(file, data_ofs, maxbytes);
fail:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -ENXIO;
}
@@ -418,19 +418,20 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
+ int err;
if (f2fs_encrypted_inode(inode)) {
- int err = f2fs_get_encryption_info(inode);
+ err = fscrypt_get_encryption_info(inode);
if (err)
return 0;
+ if (!f2fs_encrypted_inode(inode))
+ return -ENOKEY;
}
/* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- int err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
@@ -440,12 +441,22 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
static int f2fs_file_open(struct inode *inode, struct file *filp)
{
int ret = generic_file_open(inode, filp);
+ struct dentry *dir;
if (!ret && f2fs_encrypted_inode(inode)) {
- ret = f2fs_get_encryption_info(inode);
+ ret = fscrypt_get_encryption_info(inode);
if (ret)
- ret = -EACCES;
+ return -EACCES;
+ if (!fscrypt_has_encryption_key(inode))
+ return -ENOKEY;
}
+ dir = dget_parent(file_dentry(filp));
+ if (f2fs_encrypted_inode(d_inode(dir)) &&
+ !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+ dput(dir);
+ return -EPERM;
+ }
+ dput(dir);
return ret;
}
@@ -480,14 +491,14 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
* we will invalidate all blkaddr in the whole range.
*/
fofs = start_bidx_of_node(ofs_of_node(dn->node_page),
- F2FS_I(dn->inode)) + ofs;
+ dn->inode) + ofs;
f2fs_update_extent_cache_range(dn, fofs, 0, len);
dec_valid_block_count(sbi, dn->inode, nr_free);
- set_page_dirty(dn->node_page);
sync_inode_page(dn);
}
dn->ofs_in_node = ofs;
+ f2fs_update_time(sbi, REQ_TIME);
trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
dn->ofs_in_node, nr_free);
return nr_free;
@@ -501,8 +512,8 @@ void truncate_data_blocks(struct dnode_of_data *dn)
static int truncate_partial_data_page(struct inode *inode, u64 from,
bool cache_only)
{
- unsigned offset = from & (PAGE_CACHE_SIZE - 1);
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE - 1);
+ pgoff_t index = from >> PAGE_SHIFT;
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -521,9 +532,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
if (IS_ERR(page))
return 0;
truncate_out:
- f2fs_wait_on_page_writeback(page, DATA);
- zero_user(page, offset, PAGE_CACHE_SIZE - offset);
- if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ zero_user(page, offset, PAGE_SIZE - offset);
+ if (!cache_only || !f2fs_encrypted_inode(inode) ||
+ !S_ISREG(inode->i_mode))
set_page_dirty(page);
f2fs_put_page(page, 1);
return 0;
@@ -568,7 +580,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
goto out;
}
- count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ count = ADDRS_PER_PAGE(dn.node_page, inode);
count -= dn.ofs_in_node;
f2fs_bug_on(sbi, count < 0);
@@ -604,7 +616,7 @@ int f2fs_truncate(struct inode *inode, bool lock)
trace_f2fs_truncate(inode);
/* we should check inline_data size */
- if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) {
+ if (!f2fs_may_inline_data(inode)) {
err = f2fs_convert_inline_inode(inode);
if (err)
return err;
@@ -671,7 +683,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_valid & ATTR_SIZE) {
if (f2fs_encrypted_inode(inode) &&
- f2fs_get_encryption_info(inode))
+ fscrypt_get_encryption_info(inode))
return -EACCES;
if (attr->ia_size <= i_size_read(inode)) {
@@ -679,13 +691,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
err = f2fs_truncate(inode, true);
if (err)
return err;
- f2fs_balance_fs(F2FS_I_SB(inode));
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
} else {
/*
* do not trim all blocks after i_size if target size is
* larger than i_size.
*/
truncate_setsize(inode, attr->ia_size);
+
+ /* should convert inline inode here */
+ if (!f2fs_may_inline_data(inode)) {
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
+ }
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
}
@@ -727,7 +746,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
if (!len)
return 0;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
page = get_new_data_page(inode, NULL, index, false);
@@ -736,7 +755,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
if (IS_ERR(page))
return PTR_ERR(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
zero_user(page, start, len);
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -761,7 +780,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
return err;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
@@ -778,19 +797,17 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
pgoff_t pg_start, pg_end;
loff_t off_start, off_end;
- int ret = 0;
+ int ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ off_start = offset & (PAGE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_SIZE - 1);
if (pg_start == pg_end) {
ret = fill_zero(inode, pg_start, off_start,
@@ -800,7 +817,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
} else {
if (off_start) {
ret = fill_zero(inode, pg_start++, off_start,
- PAGE_CACHE_SIZE - off_start);
+ PAGE_SIZE - off_start);
if (ret)
return ret;
}
@@ -815,10 +832,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
- blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
- blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
+ blk_start = (loff_t)pg_start << PAGE_SHIFT;
+ blk_end = (loff_t)pg_end << PAGE_SHIFT;
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
@@ -849,10 +866,8 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
} else {
new_addr = dn.data_blkaddr;
if (!is_checkpointed_data(sbi, new_addr)) {
- dn.data_blkaddr = NULL_ADDR;
/* do not invalidate this block address */
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, NULL_ADDR);
do_replace = true;
}
f2fs_put_dnode(&dn);
@@ -879,7 +894,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
get_node_info(sbi, dn.nid, &ni);
f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
- ni.version, true);
+ ni.version, true, false);
f2fs_put_dnode(&dn);
} else {
struct page *psrc, *pdst;
@@ -887,7 +902,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
psrc = get_lock_data_page(inode, src, true);
if (IS_ERR(psrc))
return PTR_ERR(psrc);
- pdst = get_new_data_page(inode, NULL, dst, false);
+ pdst = get_new_data_page(inode, NULL, dst, true);
if (IS_ERR(pdst)) {
f2fs_put_page(psrc, 1);
return PTR_ERR(pdst);
@@ -903,9 +918,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
err_out:
if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
- dn.data_blkaddr = new_addr;
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, new_addr);
f2fs_put_dnode(&dn);
}
return ret;
@@ -918,7 +931,7 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
int ret = 0;
for (; end < nrpages; start++, end++) {
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
ret = __exchange_data_block(inode, end, start, true);
f2fs_unlock_op(sbi);
@@ -941,16 +954,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
return -EINVAL;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
- pg_start = offset >> PAGE_CACHE_SHIFT;
- pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = offset >> PAGE_SHIFT;
+ pg_end = (offset + len) >> PAGE_SHIFT;
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
@@ -991,13 +1000,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
- f2fs_balance_fs(sbi);
-
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
if (ret)
@@ -1005,11 +1010,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
truncate_pagecache_range(inode, offset, offset + len - 1);
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ off_start = offset & (PAGE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_SIZE - 1);
if (pg_start == pg_end) {
ret = fill_zero(inode, pg_start, off_start,
@@ -1023,12 +1028,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
} else {
if (off_start) {
ret = fill_zero(inode, pg_start++, off_start,
- PAGE_CACHE_SIZE - off_start);
+ PAGE_SIZE - off_start);
if (ret)
return ret;
new_size = max_t(loff_t, new_size,
- (loff_t)pg_start << PAGE_CACHE_SHIFT);
+ (loff_t)pg_start << PAGE_SHIFT);
}
for (index = pg_start; index < pg_end; index++) {
@@ -1053,18 +1058,13 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (dn.data_blkaddr != NEW_ADDR) {
invalidate_blocks(sbi, dn.data_blkaddr);
-
- dn.data_blkaddr = NEW_ADDR;
- set_data_blkaddr(&dn);
-
- dn.data_blkaddr = NULL_ADDR;
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, NEW_ADDR);
}
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
new_size = max_t(loff_t, new_size,
- (loff_t)(index + 1) << PAGE_CACHE_SHIFT);
+ (loff_t)(index + 1) << PAGE_SHIFT);
}
if (off_end) {
@@ -1104,13 +1104,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
return -EINVAL;
- f2fs_balance_fs(sbi);
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ f2fs_balance_fs(sbi, true);
ret = truncate_blocks(inode, i_size_read(inode), true);
if (ret)
@@ -1123,8 +1121,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
truncate_pagecache(inode, offset);
- pg_start = offset >> PAGE_CACHE_SHIFT;
- pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+ pg_start = offset >> PAGE_SHIFT;
+ pg_end = (offset + len) >> PAGE_SHIFT;
delta = pg_end - pg_start;
nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
@@ -1154,23 +1152,21 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
loff_t off_start, off_end;
int ret = 0;
- f2fs_balance_fs(sbi);
-
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
return ret;
- if (f2fs_has_inline_data(inode)) {
- ret = f2fs_convert_inline_inode(inode);
- if (ret)
- return ret;
- }
+ ret = f2fs_convert_inline_inode(inode);
+ if (ret)
+ return ret;
- pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
- pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+ f2fs_balance_fs(sbi, true);
- off_start = offset & (PAGE_CACHE_SIZE - 1);
- off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
+
+ off_start = offset & (PAGE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_SIZE - 1);
f2fs_lock_op(sbi);
@@ -1188,12 +1184,12 @@ noalloc:
if (pg_start == pg_end)
new_size = offset + len;
else if (index == pg_start && off_start)
- new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
+ new_size = (loff_t)(index + 1) << PAGE_SHIFT;
else if (index == pg_end)
- new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
+ new_size = ((loff_t)index << PAGE_SHIFT) +
off_end;
else
- new_size += PAGE_CACHE_SIZE;
+ new_size += PAGE_SIZE;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -1226,7 +1222,7 @@ static long f2fs_fallocate(struct file *file, int mode,
FALLOC_FL_INSERT_RANGE))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (mode & FALLOC_FL_PUNCH_HOLE) {
if (offset >= inode->i_size)
@@ -1246,10 +1242,11 @@ static long f2fs_fallocate(struct file *file, int mode,
if (!ret) {
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
trace_f2fs_fallocate(inode, mode, offset, len, ret);
return ret;
@@ -1259,7 +1256,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
{
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- commit_inmem_pages(inode, true);
+ drop_inmem_pages(inode);
if (f2fs_is_volatile_file(inode)) {
set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
filemap_fdatawrite(inode->i_mapping);
@@ -1313,13 +1310,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
flags = f2fs_mask_flags(inode->i_mode, flags);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = fi->i_flags;
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = -EPERM;
goto out;
}
@@ -1328,7 +1325,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
flags = flags & FS_FL_USER_MODIFIABLE;
flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
fi->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
f2fs_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME;
@@ -1353,8 +1350,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
if (f2fs_is_atomic_file(inode))
return 0;
@@ -1363,6 +1358,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
return ret;
set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
return 0;
}
@@ -1383,9 +1380,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- ret = commit_inmem_pages(inode, false);
- if (ret)
+ ret = commit_inmem_pages(inode);
+ if (ret) {
+ set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
goto err_out;
+ }
}
ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
@@ -1410,6 +1409,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
return ret;
set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return 0;
}
@@ -1441,13 +1441,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
if (ret)
return ret;
- f2fs_balance_fs(F2FS_I_SB(inode));
-
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
- commit_inmem_pages(inode, true);
+ if (f2fs_is_atomic_file(inode)) {
+ clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ drop_inmem_pages(inode);
+ }
+ if (f2fs_is_volatile_file(inode)) {
+ clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+ }
mnt_drop_write_file(filp);
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return ret;
}
@@ -1487,6 +1491,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
default:
return -EINVAL;
}
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
}
@@ -1517,6 +1522,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
if (copy_to_user((struct fstrim_range __user *)arg, &range,
sizeof(range)))
return -EFAULT;
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return 0;
}
@@ -1532,38 +1538,30 @@ static bool uuid_is_nonzero(__u8 u[16])
static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- struct f2fs_encryption_policy policy;
+ struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
- if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg,
- sizeof(policy)))
+ if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
+ sizeof(policy)))
return -EFAULT;
- return f2fs_process_policy(&policy, inode);
-#else
- return -EOPNOTSUPP;
-#endif
+ f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+ return fscrypt_process_policy(inode, &policy);
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
{
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- struct f2fs_encryption_policy policy;
+ struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
int err;
- err = f2fs_get_policy(inode, &policy);
+ err = fscrypt_get_policy(inode, &policy);
if (err)
return err;
- if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy,
- sizeof(policy)))
+ if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
return -EFAULT;
return 0;
-#else
- return -EOPNOTSUPP;
-#endif
}
static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
@@ -1586,13 +1584,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
err = f2fs_commit_super(sbi, false);
-
- mnt_drop_write_file(filp);
if (err) {
/* undo new data */
memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
+ mnt_drop_write_file(filp);
return err;
}
+ mnt_drop_write_file(filp);
got_it:
if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
16))
@@ -1629,7 +1627,6 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct cp_control cpc;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1637,13 +1634,196 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
if (f2fs_readonly(sbi->sb))
return -EROFS;
- cpc.reason = __get_cp_reason(sbi);
+ return f2fs_sync_fs(sbi->sb, 1);
+}
- mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
- mutex_unlock(&sbi->gc_mutex);
+static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
+ struct file *filp,
+ struct f2fs_defragment *range)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_map_blocks map = { .m_next_pgofs = NULL };
+ struct extent_info ei;
+ pgoff_t pg_start, pg_end;
+ unsigned int blk_per_seg = sbi->blocks_per_seg;
+ unsigned int total = 0, sec_num;
+ unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
+ block_t blk_end = 0;
+ bool fragmented = false;
+ int err;
- return 0;
+ /* if in-place-update policy is enabled, don't waste time here */
+ if (need_inplace_update(inode))
+ return -EINVAL;
+
+ pg_start = range->start >> PAGE_SHIFT;
+ pg_end = (range->start + range->len) >> PAGE_SHIFT;
+
+ f2fs_balance_fs(sbi, true);
+
+ inode_lock(inode);
+
+ /* writeback all dirty pages in the range */
+ err = filemap_write_and_wait_range(inode->i_mapping, range->start,
+ range->start + range->len - 1);
+ if (err)
+ goto out;
+
+ /*
+ * lookup mapping info in extent cache, skip defragmenting if physical
+ * block addresses are continuous.
+ */
+ if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+ if (ei.fofs + ei.len >= pg_end)
+ goto out;
+ }
+
+ map.m_lblk = pg_start;
+
+ /*
+ * lookup mapping info in dnode page cache, skip defragmenting if all
+ * physical block addresses are continuous even if there are hole(s)
+ * in logical blocks.
+ */
+ while (map.m_lblk < pg_end) {
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
+
+ if (blk_end && blk_end != map.m_pblk) {
+ fragmented = true;
+ break;
+ }
+ blk_end = map.m_pblk + map.m_len;
+
+ map.m_lblk += map.m_len;
+ }
+
+ if (!fragmented)
+ goto out;
+
+ map.m_lblk = pg_start;
+ map.m_len = pg_end - pg_start;
+
+ sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+
+ /*
+ * make sure there are enough free section for LFS allocation, this can
+ * avoid defragment running in SSR mode when free section are allocated
+ * intensively
+ */
+ if (has_not_enough_free_secs(sbi, sec_num)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ while (map.m_lblk < pg_end) {
+ pgoff_t idx;
+ int cnt = 0;
+
+do_map:
+ map.m_len = pg_end - map.m_lblk;
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ if (err)
+ goto clear_out;
+
+ if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+ map.m_lblk++;
+ continue;
+ }
+
+ set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+ idx = map.m_lblk;
+ while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+ struct page *page;
+
+ page = get_lock_data_page(inode, idx, true);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto clear_out;
+ }
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+
+ idx++;
+ cnt++;
+ total++;
+ }
+
+ map.m_lblk = idx;
+
+ if (idx < pg_end && cnt < blk_per_seg)
+ goto do_map;
+
+ clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+ err = filemap_fdatawrite(inode->i_mapping);
+ if (err)
+ goto out;
+ }
+clear_out:
+ clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+out:
+ inode_unlock(inode);
+ if (!err)
+ range->len = (u64)total << PAGE_SHIFT;
+ return err;
+}
+
+static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_defragment range;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ if (f2fs_readonly(sbi->sb)) {
+ err = -EROFS;
+ goto out;
+ }
+
+ if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
+ sizeof(range))) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* verify alignment of offset & size */
+ if (range.start & (F2FS_BLKSIZE - 1) ||
+ range.len & (F2FS_BLKSIZE - 1)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = f2fs_defragment_range(sbi, filp, &range);
+ f2fs_update_time(sbi, REQ_TIME);
+ if (err < 0)
+ goto out;
+
+ if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
+ sizeof(range)))
+ err = -EFAULT;
+out:
+ mnt_drop_write_file(filp);
+ return err;
}
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -1679,6 +1859,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_gc(filp, arg);
case F2FS_IOC_WRITE_CHECKPOINT:
return f2fs_ioc_write_checkpoint(filp, arg);
+ case F2FS_IOC_DEFRAGMENT:
+ return f2fs_ioc_defragment(filp, arg);
default:
return -ENOTTY;
}
@@ -1686,14 +1868,32 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct inode *inode = file_inode(iocb->ki_filp);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ ssize_t ret;
if (f2fs_encrypted_inode(inode) &&
- !f2fs_has_encryption_key(inode) &&
- f2fs_get_encryption_info(inode))
+ !fscrypt_has_encryption_key(inode) &&
+ fscrypt_get_encryption_info(inode))
return -EACCES;
- return generic_file_write_iter(iocb, from);
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0) {
+ ret = f2fs_preallocate_blocks(iocb, from);
+ if (!ret)
+ ret = __generic_file_write_iter(iocb, from);
+ }
+ inode_unlock(inode);
+
+ if (ret > 0) {
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
+ }
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -1706,6 +1906,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC32_SETFLAGS:
cmd = F2FS_IOC_SETFLAGS;
break;
+ case F2FS_IOC32_GETVERSION:
+ cmd = F2FS_IOC_GETVERSION;
+ break;
+ case F2FS_IOC_START_ATOMIC_WRITE:
+ case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+ case F2FS_IOC_START_VOLATILE_WRITE:
+ case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+ case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ case F2FS_IOC_SHUTDOWN:
+ case F2FS_IOC_SET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+ case F2FS_IOC_GET_ENCRYPTION_POLICY:
+ case F2FS_IOC_GARBAGE_COLLECT:
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ case F2FS_IOC_DEFRAGMENT:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index fedbf67a0842..b0051a97824c 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -16,7 +16,6 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
-#include <linux/blkdev.h>
#include "f2fs.h"
#include "node.h"
@@ -173,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
{
/* SSR allocates in a segment unit */
if (p->alloc_mode == SSR)
- return 1 << sbi->log_blocks_per_seg;
+ return sbi->blocks_per_seg;
if (p->gc_mode == GC_GREEDY)
- return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+ return sbi->blocks_per_seg * p->ofs_unit;
else if (p->gc_mode == GC_CB)
return UINT_MAX;
else /* No other gc_mode */
@@ -246,6 +245,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
return get_cb_cost(sbi, segno);
}
+static unsigned int count_bits(const unsigned long *addr,
+ unsigned int offset, unsigned int len)
+{
+ unsigned int end = offset + len, sum = 0;
+
+ while (offset < end) {
+ if (test_bit(offset++, addr))
+ ++sum;
+ }
+ return sum;
+}
+
/*
* This function is called from two paths.
* One is garbage collection and the other is SSR segment selection.
@@ -259,9 +270,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
- unsigned int secno, max_cost;
+ unsigned int secno, max_cost, last_victim;
unsigned int last_segment = MAIN_SEGS(sbi);
- int nsearched = 0;
+ unsigned int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
@@ -274,6 +285,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
if (p.max_search == 0)
goto out;
+ last_victim = sbi->last_victim[p.gc_mode];
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -296,27 +308,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
p.offset = segno + p.ofs_unit;
- if (p.ofs_unit > 1)
+ if (p.ofs_unit > 1) {
p.offset -= segno % p.ofs_unit;
+ nsearched += count_bits(p.dirty_segmap,
+ p.offset - p.ofs_unit,
+ p.ofs_unit);
+ } else {
+ nsearched++;
+ }
+
secno = GET_SECNO(sbi, segno);
if (sec_usage_check(sbi, secno))
- continue;
+ goto next;
if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
- continue;
+ goto next;
cost = get_gc_cost(sbi, segno, &p);
if (p.min_cost > cost) {
p.min_segno = segno;
p.min_cost = cost;
- } else if (unlikely(cost == max_cost)) {
- continue;
}
-
- if (nsearched++ >= p.max_search) {
- sbi->last_victim[p.gc_mode] = segno;
+next:
+ if (nsearched >= p.max_search) {
+ if (!sbi->last_victim[p.gc_mode] && segno <= last_victim)
+ sbi->last_victim[p.gc_mode] = last_victim + 1;
+ else
+ sbi->last_victim[p.gc_mode] = segno + 1;
break;
}
}
@@ -400,7 +420,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
* On validity, copy that node with cold status, otherwise (invalid node)
* ignore that.
*/
-static int gc_node_segment(struct f2fs_sb_info *sbi,
+static void gc_node_segment(struct f2fs_sb_info *sbi,
struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
bool initial = true;
@@ -420,7 +440,7 @@ next_step:
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
- return 0;
+ return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
@@ -447,7 +467,7 @@ next_step:
/* set page dirty and write it */
if (gc_type == FG_GC) {
- f2fs_wait_on_page_writeback(node_page, NODE);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
set_page_dirty(node_page);
} else {
if (!PageWriteback(node_page))
@@ -461,20 +481,6 @@ next_step:
initial = false;
goto next_step;
}
-
- if (gc_type == FG_GC) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .for_reclaim = 0,
- };
- sync_node_pages(sbi, 0, &wbc);
-
- /* return 1 only if FG_GC succefully reclaimed one */
- if (get_valid_blocks(sbi, segno, 1) == 0)
- return 1;
- }
- return 0;
}
/*
@@ -484,7 +490,7 @@ next_step:
* as indirect or double indirect node blocks, are given, it must be a caller's
* bug.
*/
-block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
+block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
unsigned int bidx;
@@ -501,7 +507,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi)
int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
bidx = node_ofs - 5 - dec;
}
- return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi);
+ return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
}
static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -547,6 +553,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
struct f2fs_summary sum;
struct node_info ni;
struct page *page;
+ block_t newaddr;
int err;
/* do not read out */
@@ -568,21 +575,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
* don't cache encrypted data into meta inode until previous dirty
* data were writebacked to avoid racing between GC and flush.
*/
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
get_node_info(fio.sbi, dn.nid, &ni);
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
/* read page */
fio.page = page;
- fio.blk_addr = dn.data_blkaddr;
+ fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
- fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi),
- fio.blk_addr,
- FGP_LOCK|FGP_CREAT,
- GFP_NOFS);
- if (!fio.encrypted_page)
- goto put_out;
+ allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
+ &sum, CURSEG_COLD_DATA);
+
+ fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
+ FGP_LOCK | FGP_CREAT, GFP_NOFS);
+ if (!fio.encrypted_page) {
+ err = -ENOMEM;
+ goto recover_block;
+ }
err = f2fs_submit_page_bio(&fio);
if (err)
@@ -591,33 +601,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
/* write page */
lock_page(fio.encrypted_page);
- if (unlikely(!PageUptodate(fio.encrypted_page)))
+ if (unlikely(!PageUptodate(fio.encrypted_page))) {
+ err = -EIO;
goto put_page_out;
- if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi)))
+ }
+ if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
+ err = -EIO;
goto put_page_out;
+ }
set_page_dirty(fio.encrypted_page);
- f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
set_page_writeback(fio.encrypted_page);
/* allocate block address */
- f2fs_wait_on_page_writeback(dn.node_page, NODE);
- allocate_data_block(fio.sbi, NULL, fio.blk_addr,
- &fio.blk_addr, &sum, CURSEG_COLD_DATA);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
+
fio.rw = WRITE_SYNC;
+ fio.new_blkaddr = newaddr;
f2fs_submit_page_mbio(&fio);
- dn.data_blkaddr = fio.blk_addr;
- set_data_blkaddr(&dn);
- f2fs_update_extent_cache(&dn);
+ f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
if (page->index == 0)
set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
+recover_block:
+ if (err)
+ __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
+ true, true);
put_out:
f2fs_put_dnode(&dn);
out:
@@ -646,7 +662,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
.encrypted_page = NULL,
};
set_page_dirty(page);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
if (clear_page_dirty_for_io(page))
inode_dec_dirty_pages(inode);
set_cold_data(page);
@@ -664,7 +680,7 @@ out:
* If the parent node is not valid or the data block address is different,
* the victim data block is ignored.
*/
-static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
struct super_block *sb = sbi->sb;
@@ -687,7 +703,7 @@ next_step:
/* stop BG_GC if there is not enough free sections. */
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0))
- return 0;
+ return;
if (check_valid_map(sbi, segno, off) == 0)
continue;
@@ -720,7 +736,7 @@ next_step:
continue;
}
- start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
+ start_bidx = start_bidx_of_node(nofs, inode);
data_page = get_read_data_page(inode,
start_bidx + ofs_in_node, READA, true);
if (IS_ERR(data_page)) {
@@ -736,7 +752,7 @@ next_step:
/* phase 3 */
inode = find_gc_inode(gc_list, dni.ino);
if (inode) {
- start_bidx = start_bidx_of_node(nofs, F2FS_I(inode))
+ start_bidx = start_bidx_of_node(nofs, inode)
+ ofs_in_node;
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
move_encrypted_block(inode, start_bidx);
@@ -748,15 +764,6 @@ next_step:
if (++phase < 4)
goto next_step;
-
- if (gc_type == FG_GC) {
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
-
- /* return 1 only if FG_GC succefully reclaimed one */
- if (get_valid_blocks(sbi, segno, 1) == 0)
- return 1;
- }
- return 0;
}
static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
@@ -772,53 +779,92 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
return ret;
}
-static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+static int do_garbage_collect(struct f2fs_sb_info *sbi,
+ unsigned int start_segno,
struct gc_inode_list *gc_list, int gc_type)
{
struct page *sum_page;
struct f2fs_summary_block *sum;
struct blk_plug plug;
- int nfree = 0;
+ unsigned int segno = start_segno;
+ unsigned int end_segno = start_segno + sbi->segs_per_sec;
+ int seg_freed = 0;
+ unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
+ SUM_TYPE_DATA : SUM_TYPE_NODE;
- /* read segment summary of victim */
- sum_page = get_sum_page(sbi, segno);
+ /* readahead multi ssa blocks those have contiguous address */
+ if (sbi->segs_per_sec > 1)
+ ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
+ sbi->segs_per_sec, META_SSA, true);
+
+ /* reference all summary page */
+ while (segno < end_segno) {
+ sum_page = get_sum_page(sbi, segno++);
+ unlock_page(sum_page);
+ }
blk_start_plug(&plug);
- sum = page_address(sum_page);
+ for (segno = start_segno; segno < end_segno; segno++) {
+ /* find segment summary of victim */
+ sum_page = find_get_page(META_MAPPING(sbi),
+ GET_SUM_BLOCK(sbi, segno));
+ f2fs_bug_on(sbi, !PageUptodate(sum_page));
+ f2fs_put_page(sum_page, 0);
- /*
- * this is to avoid deadlock:
- * - lock_page(sum_page) - f2fs_replace_block
- * - check_valid_map() - mutex_lock(sentry_lock)
- * - mutex_lock(sentry_lock) - change_curseg()
- * - lock_page(sum_page)
- */
- unlock_page(sum_page);
-
- switch (GET_SUM_TYPE((&sum->footer))) {
- case SUM_TYPE_NODE:
- nfree = gc_node_segment(sbi, sum->entries, segno, gc_type);
- break;
- case SUM_TYPE_DATA:
- nfree = gc_data_segment(sbi, sum->entries, gc_list,
- segno, gc_type);
- break;
+ sum = page_address(sum_page);
+ f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));
+
+ /*
+ * this is to avoid deadlock:
+ * - lock_page(sum_page) - f2fs_replace_block
+ * - check_valid_map() - mutex_lock(sentry_lock)
+ * - mutex_lock(sentry_lock) - change_curseg()
+ * - lock_page(sum_page)
+ */
+
+ if (type == SUM_TYPE_NODE)
+ gc_node_segment(sbi, sum->entries, segno, gc_type);
+ else
+ gc_data_segment(sbi, sum->entries, gc_list, segno,
+ gc_type);
+
+ stat_inc_seg_count(sbi, type, gc_type);
+
+ f2fs_put_page(sum_page, 0);
+ }
+
+ if (gc_type == FG_GC) {
+ if (type == SUM_TYPE_NODE) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_reclaim = 0,
+ };
+ sync_node_pages(sbi, 0, &wbc);
+ } else {
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ }
}
+
blk_finish_plug(&plug);
- stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
+ if (gc_type == FG_GC) {
+ while (start_segno < end_segno)
+ if (get_valid_blocks(sbi, start_segno++, 1) == 0)
+ seg_freed++;
+ }
+
stat_inc_call_count(sbi->stat_info);
- f2fs_put_page(sum_page, 0);
- return nfree;
+ return seg_freed;
}
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
- unsigned int segno, i;
+ unsigned int segno;
int gc_type = sync ? FG_GC : BG_GC;
- int sec_freed = 0;
+ int sec_freed = 0, seg_freed;
int ret = -EINVAL;
struct cp_control cpc;
struct gc_inode_list gc_list = {
@@ -832,35 +878,31 @@ gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
goto stop;
+ }
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
gc_type = FG_GC;
+ /*
+ * If there is no victim and no prefree segment but still not
+ * enough free sections, we should flush dent/node blocks and do
+ * garbage collections.
+ */
if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
write_checkpoint(sbi, &cpc);
+ else if (has_not_enough_free_secs(sbi, 0))
+ write_checkpoint(sbi, &cpc);
}
if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type))
goto stop;
ret = 0;
- /* readahead multi ssa blocks those have contiguous address */
- if (sbi->segs_per_sec > 1)
- ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
- META_SSA, true);
-
- for (i = 0; i < sbi->segs_per_sec; i++) {
- /*
- * for FG_GC case, halt gcing left segments once failed one
- * of segments in selected section to avoid long latency.
- */
- if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
- gc_type == FG_GC)
- break;
- }
+ seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
- if (i == sbi->segs_per_sec && gc_type == FG_GC)
+ if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
sec_freed++;
if (gc_type == FG_GC)
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index b4a65be9f7d3..a993967dcdb9 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
return true;
return false;
}
-
-static inline int is_idle(struct f2fs_sb_info *sbi)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- struct request_list *rl = &q->root_rl;
- return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
-}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index bda7126466c0..a2fbe6f427d3 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -16,9 +16,6 @@
bool f2fs_may_inline_data(struct inode *inode)
{
- if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
- return false;
-
if (f2fs_is_atomic_file(inode))
return false;
@@ -54,7 +51,7 @@ void read_inline_data(struct page *page, struct page *ipage)
f2fs_bug_on(F2FS_P_SB(page), page->index);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
/* Copy the whole inline data block */
src_addr = inline_data_addr(ipage);
@@ -74,7 +71,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from)
addr = inline_data_addr(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
memset(addr + from, 0, MAX_INLINE_DATA - from);
return true;
@@ -96,7 +93,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
}
if (page->index)
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
else
read_inline_data(page, ipage);
@@ -108,7 +105,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
{
- void *src_addr, *dst_addr;
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(dn->inode),
.type = DATA,
@@ -118,8 +114,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
};
int dirty, err;
- f2fs_bug_on(F2FS_I_SB(dn->inode), page->index);
-
if (!f2fs_exist_data(dn->inode))
goto clear_out;
@@ -127,21 +121,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
if (err)
return err;
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page));
- if (PageUptodate(page))
- goto no_update;
-
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
-
- /* Copy the whole inline data block */
- src_addr = inline_data_addr(dn->inode_page);
- dst_addr = kmap_atomic(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
- flush_dcache_page(page);
- kunmap_atomic(dst_addr);
- SetPageUptodate(page);
-no_update:
+ read_inline_data(page, dn->inode_page);
set_page_dirty(page);
/* clear dirty state */
@@ -149,11 +131,9 @@ no_update:
/* write data page to try to make data consistent */
set_page_writeback(page);
- fio.blk_addr = dn->data_blkaddr;
+ fio.old_blkaddr = dn->data_blkaddr;
write_data_page(dn, &fio);
- set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
- f2fs_wait_on_page_writeback(page, DATA);
+ f2fs_wait_on_page_writeback(page, DATA, true);
if (dirty)
inode_dec_dirty_pages(dn->inode);
@@ -162,6 +142,7 @@ no_update:
/* clear inline data and flag after data writeback */
truncate_inline_inode(dn->inode_page, 0);
+ clear_inline_node(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
f2fs_clear_inline_inode(dn->inode);
@@ -177,6 +158,9 @@ int f2fs_convert_inline_inode(struct inode *inode)
struct page *ipage, *page;
int err = 0;
+ if (!f2fs_has_inline_data(inode))
+ return 0;
+
page = grab_cache_page(inode->i_mapping, 0);
if (!page)
return -ENOMEM;
@@ -199,6 +183,9 @@ out:
f2fs_unlock_op(sbi);
f2fs_put_page(page, 1);
+
+ f2fs_balance_fs(sbi, dn.node_changed);
+
return err;
}
@@ -220,7 +207,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_bug_on(F2FS_I_SB(inode), page->index);
- f2fs_wait_on_page_writeback(dn.inode_page, NODE);
+ f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
src_addr = kmap_atomic(page);
dst_addr = inline_data_addr(dn.inode_page);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
@@ -230,6 +217,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
sync_inode_page(&dn);
+ clear_inline_node(dn.inode_page);
f2fs_put_dnode(&dn);
return 0;
}
@@ -258,7 +246,7 @@ process_inline:
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
src_addr = inline_data_addr(npage);
dst_addr = inline_data_addr(ipage);
@@ -289,7 +277,7 @@ process_inline:
}
struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
- struct f2fs_filename *fname, struct page **res_page)
+ struct fscrypt_name *fname, struct page **res_page)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct f2fs_inline_dentry *inline_dentry;
@@ -386,8 +374,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
if (err)
goto out;
- f2fs_wait_on_page_writeback(page, DATA);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
dentry_blk = kmap_atomic(page);
@@ -417,8 +405,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
stat_dec_inline_dir(dir);
clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
- if (i_size_read(dir) < PAGE_CACHE_SIZE) {
- i_size_write(dir, PAGE_CACHE_SIZE);
+ if (i_size_read(dir) < PAGE_SIZE) {
+ i_size_write(dir, PAGE_SIZE);
set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
@@ -466,7 +454,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
}
}
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
name_hash = f2fs_dentry_hash(name);
make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
@@ -504,7 +492,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
int i;
lock_page(page);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
inline_dentry = inline_data_addr(page);
bit_pos = dentry - inline_dentry->dentry;
@@ -547,7 +535,7 @@ bool f2fs_empty_inline_dir(struct inode *dir)
}
int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
- struct f2fs_str *fstr)
+ struct fscrypt_str *fstr)
{
struct inode *inode = file_inode(file);
struct f2fs_inline_dentry *inline_dentry = NULL;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 97e20decacb4..cb269c46ac25 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -83,7 +83,7 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
while (start < end) {
if (*start++) {
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
@@ -138,7 +138,8 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- f2fs_init_extent_tree(inode, &ri->i_ext);
+ if (f2fs_init_extent_tree(inode, &ri->i_ext))
+ set_page_dirty(node_page);
get_inline_info(fi, ri);
@@ -202,6 +203,7 @@ make_now:
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -221,11 +223,11 @@ bad_inode:
return ERR_PTR(ret);
}
-void update_inode(struct inode *inode, struct page *node_page)
+int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(node_page, NODE);
+ f2fs_wait_on_page_writeback(node_page, NODE, true);
ri = F2FS_INODE(node_page);
@@ -259,15 +261,20 @@ void update_inode(struct inode *inode, struct page *node_page)
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
- set_page_dirty(node_page);
-
clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+
+ /* deleted inode */
+ if (inode->i_nlink == 0)
+ clear_inline_node(node_page);
+
+ return set_page_dirty(node_page);
}
-void update_inode_page(struct inode *inode)
+int update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *node_page;
+ int ret = 0;
retry:
node_page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(node_page)) {
@@ -278,10 +285,11 @@ retry:
} else if (err != -ENOENT) {
f2fs_stop_checkpoint(sbi);
}
- return;
+ return 0;
}
- update_inode(inode, node_page);
+ ret = update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
+ return ret;
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -299,9 +307,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- update_inode_page(inode);
-
- f2fs_balance_fs(sbi);
+ if (update_inode_page(inode))
+ f2fs_balance_fs(sbi, true);
return 0;
}
@@ -317,7 +324,7 @@ void f2fs_evict_inode(struct inode *inode)
/* some remained atomic pages should discarded */
if (f2fs_is_atomic_file(inode))
- commit_inmem_pages(inode, true);
+ drop_inmem_pages(inode);
trace_f2fs_evict_inode(inode);
truncate_inode_pages_final(&inode->i_data);
@@ -327,7 +334,7 @@ void f2fs_evict_inode(struct inode *inode)
goto out_clear;
f2fs_bug_on(sbi, get_dirty_pages(inode));
- remove_dirty_dir_inode(inode);
+ remove_dirty_inode(inode);
f2fs_destroy_extent_tree(inode);
@@ -357,9 +364,9 @@ no_delete:
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
if (is_inode_flag_set(fi, FI_APPEND_WRITE))
- add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+ add_ino_entry(sbi, inode->i_ino, APPEND_INO);
if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
- add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+ add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
if (is_inode_flag_set(fi, FI_FREE_NID)) {
if (err && err != -ENOENT)
alloc_nid_done(sbi, inode->i_ino);
@@ -382,10 +389,7 @@ no_delete:
}
}
out_clear:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- if (fi->i_crypt_info)
- f2fs_free_encryption_info(inode, fi->i_crypt_info);
-#endif
+ fscrypt_put_encryption_info(inode, NULL);
clear_inode(inode);
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e48b80c49090..013e57932d61 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -60,7 +60,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
f2fs_set_encrypted_inode(inode);
- if (f2fs_may_inline_data(inode))
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
@@ -128,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
nid_t ino = 0;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -142,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -169,10 +169,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
int err;
if (f2fs_encrypted_inode(dir) &&
- !f2fs_is_child_context_consistent_with_parent(dir, inode))
+ !fscrypt_has_permitted_context(dir, inode))
return -EPERM;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
@@ -214,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
struct page *page;
int err = 0;
+ if (f2fs_readonly(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_INFO,
+ "skip recovering inline_dots inode (ino:%lu, pino:%u) "
+ "in readonly mountpoint", dir->i_ino, pino);
+ return 0;
+ }
+
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
de = f2fs_find_entry(dir, &dot, &page);
@@ -251,6 +260,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
struct page *page;
nid_t ino;
int err = 0;
+ unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
+
+ if (f2fs_encrypted_inode(dir)) {
+ int res = fscrypt_get_encryption_info(dir);
+
+ /*
+ * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
+ * created while the directory was encrypted and we
+ * don't have access to the key.
+ */
+ if (fscrypt_has_encryption_key(dir))
+ fscrypt_set_encrypted_dentry(dentry);
+ fscrypt_set_d_op(dentry);
+ if (res && res != -ENOKEY)
+ return ERR_PTR(res);
+ }
if (dentry->d_name.len > F2FS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -267,15 +292,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
return ERR_CAST(inode);
+ if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
+ err = __recover_dot_dentries(dir, root_ino);
+ if (err)
+ goto err_out;
+ }
+
if (f2fs_has_inline_dots(inode)) {
err = __recover_dot_dentries(inode, dir->i_ino);
if (err)
goto err_out;
}
+ if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) &&
+ (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
+ !fscrypt_has_permitted_context(dir, inode)) {
+ bool nokey = f2fs_encrypted_inode(inode) &&
+ !fscrypt_has_encryption_key(inode);
+ err = nokey ? -ENOKEY : -EPERM;
+ goto err_out;
+ }
return d_splice_alias(inode, dentry);
err_out:
- iget_failed(inode);
+ iput(inode);
return ERR_PTR(err);
}
@@ -288,12 +327,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
int err = -ENOENT;
trace_f2fs_unlink_enter(dir, dentry);
- f2fs_balance_fs(sbi);
de = f2fs_find_entry(dir, &dentry->d_name, &page);
if (!de)
goto fail;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err) {
@@ -315,12 +355,15 @@ fail:
return err;
}
-static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- const char *link = page_follow_link_light(dentry, cookie);
+ const char *link = page_get_link(dentry, inode, done);
if (!IS_ERR(link) && !*link) {
/* this is broken symlink case */
- page_put_link(NULL, *cookie);
+ do_delayed_call(done);
+ clear_delayed_call(done);
link = ERR_PTR(-ENOENT);
}
return link;
@@ -332,16 +375,24 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
size_t len = strlen(symname);
- size_t p_len;
- char *p_str;
- struct f2fs_str disk_link = FSTR_INIT(NULL, 0);
- struct f2fs_encrypted_symlink_data *sd = NULL;
+ struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
+ struct fscrypt_symlink_data *sd = NULL;
int err;
- if (len > dir->i_sb->s_blocksize)
- return -ENAMETOOLONG;
+ if (f2fs_encrypted_inode(dir)) {
+ err = fscrypt_get_encryption_info(dir);
+ if (err)
+ return err;
- f2fs_balance_fs(sbi);
+ if (!fscrypt_has_encryption_key(dir))
+ return -EPERM;
+
+ disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
+ sizeof(struct fscrypt_symlink_data));
+ }
+
+ if (disk_link.len > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
@@ -351,8 +402,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &f2fs_encrypted_symlink_inode_operations;
else
inode->i_op = &f2fs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -360,42 +414,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
f2fs_unlock_op(sbi);
alloc_nid_done(sbi, inode->i_ino);
- if (f2fs_encrypted_inode(dir)) {
+ if (f2fs_encrypted_inode(inode)) {
struct qstr istr = QSTR_INIT(symname, len);
+ struct fscrypt_str ostr;
- err = f2fs_get_encryption_info(inode);
- if (err)
+ sd = kzalloc(disk_link.len, GFP_NOFS);
+ if (!sd) {
+ err = -ENOMEM;
goto err_out;
+ }
- err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link);
+ err = fscrypt_get_encryption_info(inode);
if (err)
goto err_out;
- err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link);
- if (err < 0)
- goto err_out;
-
- p_len = encrypted_symlink_data_len(disk_link.len) + 1;
-
- if (p_len > dir->i_sb->s_blocksize) {
- err = -ENAMETOOLONG;
+ if (!fscrypt_has_encryption_key(inode)) {
+ err = -EPERM;
goto err_out;
}
- sd = kzalloc(p_len, GFP_NOFS);
- if (!sd) {
- err = -ENOMEM;
+ ostr.name = sd->encrypted_path;
+ ostr.len = disk_link.len;
+ err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
+ if (err < 0)
goto err_out;
- }
- memcpy(sd->encrypted_path, disk_link.name, disk_link.len);
- sd->len = cpu_to_le16(disk_link.len);
- p_str = (char *)sd;
- } else {
- p_len = len + 1;
- p_str = (char *)symname;
+
+ sd->len = cpu_to_le16(ostr.len);
+ disk_link.name = (char *)sd;
}
- err = page_symlink(inode, p_str, p_len);
+ err = page_symlink(inode, disk_link.name, disk_link.len);
err_out:
d_instantiate(dentry, inode);
@@ -411,7 +459,8 @@ err_out:
* performance regression.
*/
if (!err) {
- filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
+ filemap_write_and_wait_range(inode->i_mapping, 0,
+ disk_link.len - 1);
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
@@ -420,7 +469,6 @@ err_out:
}
kfree(sd);
- f2fs_fname_crypto_free_buffer(&disk_link);
return err;
out:
handle_failed_inode(inode);
@@ -433,8 +481,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int err;
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -444,6 +490,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_mapping->a_ops = &f2fs_dblock_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
+ f2fs_balance_fs(sbi, true);
+
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
@@ -481,11 +529,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -493,6 +536,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
if (err)
@@ -519,9 +564,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err;
- if (!whiteout)
- f2fs_balance_fs(sbi);
-
inode = f2fs_new_inode(dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -535,6 +577,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
}
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err)
@@ -572,7 +616,7 @@ out:
static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
if (f2fs_encrypted_inode(dir)) {
- int err = f2fs_get_encryption_info(dir);
+ int err = fscrypt_get_encryption_info(dir);
if (err)
return err;
}
@@ -598,17 +642,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
+ bool is_old_inline = f2fs_has_inline_dentry(old_dir);
int err = -ENOENT;
if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
- !f2fs_is_child_context_consistent_with_parent(new_dir,
- old_inode)) {
+ !fscrypt_has_permitted_context(new_dir, old_inode)) {
err = -EPERM;
goto out;
}
- f2fs_balance_fs(sbi);
-
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry)
goto out;
@@ -638,14 +680,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_entry)
goto out_whiteout;
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err)
goto put_out_dir;
- if (update_dent_inode(old_inode, new_inode,
- &new_dentry->d_name)) {
+ err = update_dent_inode(old_inode, new_inode,
+ &new_dentry->d_name);
+ if (err) {
release_orphan_inode(sbi);
goto put_out_dir;
}
@@ -669,6 +714,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
update_inode_page(old_inode);
update_inode_page(new_inode);
} else {
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = f2fs_add_link(new_dentry, old_inode);
@@ -681,6 +728,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
inc_nlink(new_dir);
update_inode_page(new_dir);
}
+
+ /*
+ * old entry and new entry can locate in the same inline
+ * dentry in inode, when attaching new entry in inline dentry,
+ * it could force inline dentry conversion, after that,
+ * old_entry and old_page will point to wrong address, in
+ * order to avoid this, let's do the check and update here.
+ */
+ if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) {
+ f2fs_put_page(old_page, 0);
+ old_page = NULL;
+
+ old_entry = f2fs_find_entry(old_dir,
+ &old_dentry->d_name, &old_page);
+ if (!old_entry) {
+ err = -EIO;
+ f2fs_unlock_op(sbi);
+ goto out_whiteout;
+ }
+ }
}
down_write(&F2FS_I(old_inode)->i_sem);
@@ -759,15 +826,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
int err = -ENOENT;
if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
- (old_dir != new_dir) &&
- (!f2fs_is_child_context_consistent_with_parent(new_dir,
- old_inode) ||
- !f2fs_is_child_context_consistent_with_parent(old_dir,
- new_inode)))
+ (old_dir != new_dir) &&
+ (!fscrypt_has_permitted_context(new_dir, old_inode) ||
+ !fscrypt_has_permitted_context(old_dir, new_inode)))
return -EPERM;
- f2fs_balance_fs(sbi);
-
old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_entry)
goto out;
@@ -810,6 +873,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_new_dir;
}
+ f2fs_balance_fs(sbi, true);
+
f2fs_lock_op(sbi);
err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
@@ -925,89 +990,88 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct page *cpage = NULL;
char *caddr, *paddr = NULL;
- struct f2fs_str cstr;
- struct f2fs_str pstr = FSTR_INIT(NULL, 0);
- struct inode *inode = d_inode(dentry);
- struct f2fs_encrypted_symlink_data *sd;
+ struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
+ struct fscrypt_symlink_data *sd;
loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
u32 max_size = inode->i_sb->s_blocksize;
int res;
- res = f2fs_get_encryption_info(inode);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ res = fscrypt_get_encryption_info(inode);
if (res)
return ERR_PTR(res);
cpage = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(cpage))
return ERR_CAST(cpage);
- caddr = kmap(cpage);
+ caddr = page_address(cpage);
caddr[size] = 0;
/* Symlink is encrypted */
- sd = (struct f2fs_encrypted_symlink_data *)caddr;
+ sd = (struct fscrypt_symlink_data *)caddr;
+ cstr.name = sd->encrypted_path;
cstr.len = le16_to_cpu(sd->len);
- cstr.name = kmalloc(cstr.len, GFP_NOFS);
- if (!cstr.name) {
- res = -ENOMEM;
- goto errout;
- }
- memcpy(cstr.name, sd->encrypted_path, cstr.len);
/* this is broken symlink case */
- if (cstr.name[0] == 0 && cstr.len == 0) {
+ if (unlikely(cstr.len == 0)) {
res = -ENOENT;
goto errout;
}
- if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) >
- max_size) {
+ if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
/* Symlink data on the disk is corrupted */
res = -EIO;
goto errout;
}
- res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr);
+ res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
if (res)
goto errout;
- res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+ res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
if (res < 0)
goto errout;
- kfree(cstr.name);
+ /* this is broken symlink case */
+ if (unlikely(pstr.name[0] == 0)) {
+ res = -ENOENT;
+ goto errout;
+ }
paddr = pstr.name;
/* Null-terminate the name */
paddr[res] = '\0';
- kunmap(cpage);
- page_cache_release(cpage);
- return *cookie = paddr;
+ put_page(cpage);
+ set_delayed_call(done, kfree_link, paddr);
+ return paddr;
errout:
- kfree(cstr.name);
- f2fs_fname_crypto_free_buffer(&pstr);
- kunmap(cpage);
- page_cache_release(cpage);
+ fscrypt_fname_free_buffer(&pstr);
+ put_page(cpage);
return ERR_PTR(res);
}
const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_encrypted_follow_link,
- .put_link = kfree_put_link,
+ .get_link = f2fs_encrypted_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
.removexattr = generic_removexattr,
-};
#endif
+};
const struct inode_operations f2fs_dir_inode_operations = {
.create = f2fs_create,
@@ -1034,8 +1098,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
const struct inode_operations f2fs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = f2fs_follow_link,
- .put_link = page_put_link,
+ .get_link = f2fs_get_link,
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
#ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7bcbc6e9c40d..1a33de9d84b1 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -46,11 +46,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
*/
if (type == FREE_NIDS) {
mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == NAT_ENTRIES) {
mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DIRTY_DENTS) {
if (sbi->sb->s_bdi->wb.dirty_exceeded)
@@ -62,16 +62,17 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
for (i = 0; i <= UPDATE_INO; i++)
mem_size += (sbi->im[i].ino_num *
- sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
+ sizeof(struct ino_entry)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else if (type == EXTENT_CACHE) {
- mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+ mem_size = (atomic_read(&sbi->total_ext_tree) *
+ sizeof(struct extent_tree) +
atomic_read(&sbi->total_ext_node) *
- sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
+ sizeof(struct extent_node)) >> PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
} else {
- if (sbi->sb->s_bdi->wb.dirty_exceeded)
- return false;
+ if (!sbi->sb->s_bdi->wb.dirty_exceeded)
+ return true;
}
return res;
}
@@ -120,7 +121,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
- memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ memcpy(dst_addr, src_addr, PAGE_SIZE);
set_page_dirty(dst_page);
f2fs_put_page(src_page, 1);
@@ -256,18 +257,21 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
return new;
}
-static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_nat_entry *ne)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e) {
e = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&e->ni, ne);
+ } else {
+ f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino ||
+ nat_get_blkaddr(e) != ne->block_addr ||
+ nat_get_version(e) != ne->version);
}
- up_write(&nm_i->nat_tree_lock);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -355,7 +359,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
nid_t start_nid = START_NID(nid);
struct f2fs_nat_block *nat_blk;
struct page *page = NULL;
@@ -372,21 +376,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- }
- up_read(&nm_i->nat_tree_lock);
- if (e)
+ up_read(&nm_i->nat_tree_lock);
return;
+ }
memset(&ne, 0, sizeof(struct f2fs_nat_entry));
/* Check current segment summary */
- mutex_lock(&curseg->curseg_mutex);
- i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+ down_read(&curseg->journal_rwsem);
+ i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
if (i >= 0) {
- ne = nat_in_journal(sum, i);
+ ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- mutex_unlock(&curseg->curseg_mutex);
+ up_read(&curseg->journal_rwsem);
if (i >= 0)
goto cache;
@@ -397,18 +400,52 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
+ up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
- cache_nat_entry(NM_I(sbi), nid, &ne);
+ down_write(&nm_i->nat_tree_lock);
+ cache_nat_entry(sbi, nid, &ne);
+ up_write(&nm_i->nat_tree_lock);
+}
+
+pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
+{
+ const long direct_index = ADDRS_PER_INODE(dn->inode);
+ const long direct_blks = ADDRS_PER_BLOCK;
+ const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+ unsigned int skipped_unit = ADDRS_PER_BLOCK;
+ int cur_level = dn->cur_level;
+ int max_level = dn->max_level;
+ pgoff_t base = 0;
+
+ if (!dn->max_level)
+ return pgofs + 1;
+
+ while (max_level-- > cur_level)
+ skipped_unit *= NIDS_PER_BLOCK;
+
+ switch (dn->max_level) {
+ case 3:
+ base += 2 * indirect_blks;
+ case 2:
+ base += 2 * direct_blks;
+ case 1:
+ base += direct_index;
+ break;
+ default:
+ f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
+ }
+
+ return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
}
/*
* The maximum depth is four.
* Offset[0] will have raw inode offset.
*/
-static int get_node_path(struct f2fs_inode_info *fi, long block,
+static int get_node_path(struct inode *inode, long block,
int offset[4], unsigned int noffset[4])
{
- const long direct_index = ADDRS_PER_INODE(fi);
+ const long direct_index = ADDRS_PER_INODE(inode);
const long direct_blks = ADDRS_PER_BLOCK;
const long dptrs_per_blk = NIDS_PER_BLOCK;
const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
@@ -493,10 +530,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
int offset[4];
unsigned int noffset[4];
nid_t nids[4];
- int level, i;
+ int level, i = 0;
int err = 0;
- level = get_node_path(F2FS_I(dn->inode), index, offset, noffset);
+ level = get_node_path(dn->inode, index, offset, noffset);
nids[0] = dn->inode->i_ino;
npage[0] = dn->inode_page;
@@ -583,6 +620,10 @@ release_pages:
release_out:
dn->inode_page = NULL;
dn->node_page = NULL;
+ if (err == -ENOENT) {
+ dn->cur_level = i;
+ dn->max_level = level;
+ }
return err;
}
@@ -676,7 +717,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
ret = truncate_dnode(&rdn);
if (ret < 0)
goto out_err;
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
}
} else {
child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
@@ -689,7 +731,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
rdn.nid = child_nid;
ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
if (ret == (NIDS_PER_BLOCK + 1)) {
- set_nid(page, i, 0, false);
+ if (set_nid(page, i, 0, false))
+ dn->node_changed = true;
child_nofs += ret;
} else if (ret < 0 && ret != -ENOENT) {
goto out_err;
@@ -750,7 +793,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
err = truncate_dnode(dn);
if (err < 0)
goto fail;
- set_nid(pages[idx], i, 0, false);
+ if (set_nid(pages[idx], i, 0, false))
+ dn->node_changed = true;
}
if (offset[idx + 1] == 0) {
@@ -787,7 +831,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
- level = get_node_path(F2FS_I(inode), from, offset, noffset);
+ level = get_node_path(inode, from, offset, noffset);
restart:
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -856,7 +900,7 @@ skip_partial:
f2fs_put_page(page, 1);
goto restart;
}
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
unlock_page(page);
@@ -971,11 +1015,12 @@ struct page *new_node_page(struct dnode_of_data *dn,
new_ni.ino = dn->inode->i_ino;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
SetPageUptodate(page);
- set_page_dirty(page);
+ if (set_page_dirty(page))
+ dn->node_changed = true;
if (f2fs_has_xattr_block(ofs))
F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
@@ -1023,7 +1068,7 @@ static int read_node_page(struct page *page, int rw)
if (PageUptodate(page))
return LOCKED_PAGE;
- fio.blk_addr = ni.blk_addr;
+ fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
return f2fs_submit_page_bio(&fio);
}
@@ -1035,12 +1080,15 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
struct page *apage;
int err;
- apage = find_get_page(NODE_MAPPING(sbi), nid);
- if (apage && PageUptodate(apage)) {
- f2fs_put_page(apage, 0);
+ if (!nid)
+ return;
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
+
+ rcu_read_lock();
+ apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
+ rcu_read_unlock();
+ if (apage)
return;
- }
- f2fs_put_page(apage, 0);
apage = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!apage)
@@ -1050,51 +1098,38 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
f2fs_put_page(apage, err ? 1 : 0);
}
-struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+/*
+ * readahead MAX_RA_NODE number of node pages.
+ */
+static void ra_node_pages(struct page *parent, int start)
{
- struct page *page;
- int err;
-repeat:
- page = grab_cache_page(NODE_MAPPING(sbi), nid);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ struct blk_plug plug;
+ int i, end;
+ nid_t nid;
- err = read_node_page(page, READ_SYNC);
- if (err < 0) {
- f2fs_put_page(page, 1);
- return ERR_PTR(err);
- } else if (err != LOCKED_PAGE) {
- lock_page(page);
- }
+ blk_start_plug(&plug);
- if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
- ClearPageUptodate(page);
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
- goto repeat;
+ /* Then, try readahead for siblings of the desired node */
+ end = start + MAX_RA_NODE;
+ end = min(end, NIDS_PER_BLOCK);
+ for (i = start; i < end; i++) {
+ nid = get_nid(parent, i, false);
+ ra_node_page(sbi, nid);
}
- return page;
+
+ blk_finish_plug(&plug);
}
-/*
- * Return a locked page for the desired node page.
- * And, readahead MAX_RA_NODE number of node pages.
- */
-struct page *get_node_page_ra(struct page *parent, int start)
+static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+ struct page *parent, int start)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
- struct blk_plug plug;
struct page *page;
- int err, i, end;
- nid_t nid;
+ int err;
- /* First, try getting the desired direct node. */
- nid = get_nid(parent, start, false);
if (!nid)
return ERR_PTR(-ENOENT);
+ f2fs_bug_on(sbi, check_nid_range(sbi, nid));
repeat:
page = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!page)
@@ -1108,46 +1143,91 @@ repeat:
goto page_hit;
}
- blk_start_plug(&plug);
-
- /* Then, try readahead for siblings of the desired node */
- end = start + MAX_RA_NODE;
- end = min(end, NIDS_PER_BLOCK);
- for (i = start + 1; i < end; i++) {
- nid = get_nid(parent, i, false);
- if (!nid)
- continue;
- ra_node_page(sbi, nid);
- }
-
- blk_finish_plug(&plug);
+ if (parent)
+ ra_node_pages(parent, start + 1);
lock_page(page);
+
+ if (unlikely(!PageUptodate(page))) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
page_hit:
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
- }
+ f2fs_bug_on(sbi, nid != nid_of_node(page));
return page;
}
+/*
+ * Fetch the node page for @nid. No parent page is passed to
+ * __get_node_page(), so no sibling readahead is attempted.
+ */
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+ return __get_node_page(sbi, nid, NULL, 0);
+}
+
+/*
+ * Fetch the node page whose nid is stored in slot @start of @parent.
+ * @parent and @start are handed on to __get_node_page(), which uses them
+ * to read ahead the slots following @start (ra_node_pages(parent, start + 1)).
+ */
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+ struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ nid_t nid = get_nid(parent, start, false);
+
+ return __get_node_page(sbi, nid, parent, start);
+}
+
void sync_inode_page(struct dnode_of_data *dn)
{
+ int ret = 0;
+
if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
- update_inode(dn->inode, dn->node_page);
+ ret = update_inode(dn->inode, dn->node_page);
} else if (dn->inode_page) {
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
- update_inode(dn->inode, dn->inode_page);
+ ret = update_inode(dn->inode, dn->inode_page);
if (!dn->inode_page_locked)
unlock_page(dn->inode_page);
} else {
- update_inode_page(dn->inode);
+ ret = update_inode_page(dn->inode);
}
+ dn->node_changed = ret ? true: false;
+}
+
+/*
+ * Try to write back page 0 of inode @ino through f2fs_write_inline_data().
+ *
+ * This is opportunistic: the function returns without doing anything when
+ * ilookup() does not find the inode, when the page is absent (it is looked
+ * up with FGP_NOWAIT), when trylock_page() fails, or when the page is not
+ * uptodate or not dirty.
+ */
+static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct inode *inode;
+ struct page *page;
+
+ /* should flush inline_data before evict_inode */
+ inode = ilookup(sbi->sb, ino);
+ if (!inode)
+ return;
+
+ page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0);
+ if (!page)
+ goto iput_out;
+
+ /* never wait for the page lock here; just skip the page instead */
+ if (!trylock_page(page))
+ goto release_out;
+
+ if (!PageUptodate(page))
+ goto page_out;
+
+ if (!PageDirty(page))
+ goto page_out;
+
+ if (!clear_page_dirty_for_io(page))
+ goto page_out;
+
+ /*
+ * A zero return is treated as success and the inode's dirty page
+ * count is dropped; otherwise the dirty bit cleared just above is
+ * set again so the page stays eligible for a later write.
+ */
+ if (!f2fs_write_inline_data(inode, page))
+ inode_dec_dirty_pages(inode);
+ else
+ set_page_dirty(page);
+page_out:
+ unlock_page(page);
+release_out:
+ f2fs_put_page(page, 0);
+iput_out:
+ iput(inode);
}
int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
@@ -1156,13 +1236,13 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
pgoff_t index, end;
struct pagevec pvec;
int step = ino ? 2 : 0;
- int nwritten = 0, wrote = 0;
+ int nwritten = 0;
pagevec_init(&pvec, 0);
next_step:
index = 0;
- end = LONG_MAX;
+ end = ULONG_MAX;
while (index <= end) {
int i, nr_pages;
@@ -1175,6 +1255,11 @@ next_step:
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ if (unlikely(f2fs_cp_error(sbi))) {
+ pagevec_release(&pvec);
+ return -EIO;
+ }
+
/*
* flushing sequence with step:
* 0. indirect nodes
@@ -1194,6 +1279,7 @@ next_step:
* If an fsync mode,
* we should not skip writing node pages.
*/
+lock_node:
if (ino && ino_of_node(page) == ino)
lock_page(page);
else if (!trylock_page(page))
@@ -1212,6 +1298,17 @@ continue_unlock:
goto continue_unlock;
}
+ /* flush inline_data */
+ if (!ino && is_inline_node(page)) {
+ clear_inline_node(page);
+ unlock_page(page);
+ flush_inline_data(sbi, ino_of_node(page));
+ goto lock_node;
+ }
+
+ f2fs_wait_on_page_writeback(page, NODE, true);
+
+ BUG_ON(PageWriteback(page));
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
@@ -1229,8 +1326,6 @@ continue_unlock:
if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
unlock_page(page);
- else
- wrote++;
if (--wbc->nr_to_write == 0)
break;
@@ -1248,15 +1343,12 @@ continue_unlock:
step++;
goto next_step;
}
-
- if (wrote)
- f2fs_submit_merged_bio(sbi, NODE, WRITE);
return nwritten;
}
int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
{
- pgoff_t index = 0, end = LONG_MAX;
+ pgoff_t index = 0, end = ULONG_MAX;
struct pagevec pvec;
int ret2 = 0, ret = 0;
@@ -1278,7 +1370,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
continue;
if (ino && ino_of_node(page) == ino) {
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
if (TestClearPageError(page))
ret = -EIO;
}
@@ -1317,8 +1409,6 @@ static int f2fs_write_node_page(struct page *page,
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
- f2fs_wait_on_page_writeback(page, NODE);
-
/* get old block addr of this node page */
nid = nid_of_node(page);
f2fs_bug_on(sbi, page->index != nid);
@@ -1342,14 +1432,18 @@ static int f2fs_write_node_page(struct page *page,
}
set_page_writeback(page);
- fio.blk_addr = ni.blk_addr;
+ fio.old_blkaddr = ni.blk_addr;
write_node_page(nid, &fio);
- set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
+ set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
up_read(&sbi->node_write);
- unlock_page(page);
if (wbc->for_reclaim)
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE);
+
+ unlock_page(page);
+
+ if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_bio(sbi, NODE, WRITE);
return 0;
@@ -1365,8 +1459,6 @@ static int f2fs_write_node_pages(struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
long diff;
- trace_f2fs_writepages(mapping->host, wbc, NODE);
-
/* balancing f2fs's metadata in background */
f2fs_balance_fs_bg(sbi);
@@ -1374,6 +1466,8 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
goto skip_write;
+ trace_f2fs_writepages(mapping->host, wbc, NODE);
+
diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
sync_node_pages(sbi, 0, wbc);
@@ -1382,6 +1476,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
skip_write:
wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
+ trace_f2fs_writepages(mapping->host, wbc, NODE);
return 0;
}
@@ -1440,13 +1535,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
if (build) {
/* do not add allocated nids */
- down_read(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
- if (ne &&
- (!get_nat_flag(ne, IS_CHECKPOINTED) ||
+ if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
allocated = true;
- up_read(&nm_i->nat_tree_lock);
if (allocated)
return 0;
}
@@ -1520,7 +1612,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i = 0;
nid_t nid = nm_i->next_scan_nid;
@@ -1532,6 +1624,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
+ down_read(&nm_i->nat_tree_lock);
+
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1550,16 +1644,19 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
nm_i->next_scan_nid = nid;
/* find free nids from current sum_pages */
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < nats_in_cursum(sum); i++) {
- block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
- nid = le32_to_cpu(nid_in_journal(sum, i));
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ block_t addr;
+
+ addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+ nid = le32_to_cpu(nid_in_journal(journal, i));
if (addr == NULL_ADDR)
add_free_nid(sbi, nid, true);
else
remove_free_nid(nm_i, nid);
}
- mutex_unlock(&curseg->curseg_mutex);
+ up_read(&curseg->journal_rwsem);
+ up_read(&nm_i->nat_tree_lock);
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -1582,8 +1679,6 @@ retry:
/* We should not use stale free nids created by build_free_nids */
if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
- struct node_info ni;
-
f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
list_for_each_entry(i, &nm_i->free_nid_list, list)
if (i->state == NID_NEW)
@@ -1594,13 +1689,6 @@ retry:
i->state = NID_ALLOC;
nm_i->fcnt--;
spin_unlock(&nm_i->free_nid_list_lock);
-
- /* check nid is allocated already */
- get_node_info(sbi, *nid, &ni);
- if (ni.blk_addr != NULL_ADDR) {
- alloc_nid_done(sbi, *nid);
- goto retry;
- }
return true;
}
spin_unlock(&nm_i->free_nid_list_lock);
@@ -1703,7 +1791,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
src_addr = inline_xattr_addr(page);
inline_size = inline_xattr_size(inode);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
memcpy(dst_addr, src_addr, inline_size);
update_inode:
update_inode(inode, ipage);
@@ -1831,28 +1919,26 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i;
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < nats_in_cursum(sum); i++) {
+ down_write(&curseg->journal_rwsem);
+ for (i = 0; i < nats_in_cursum(journal); i++) {
struct nat_entry *ne;
struct f2fs_nat_entry raw_ne;
- nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+ nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
- raw_ne = nat_in_journal(sum, i);
+ raw_ne = nat_in_journal(journal, i);
- down_write(&nm_i->nat_tree_lock);
ne = __lookup_nat_cache(nm_i, nid);
if (!ne) {
ne = grab_nat_entry(nm_i, nid);
node_info_from_raw_nat(&ne->ni, &raw_ne);
}
__set_nat_cache_dirty(nm_i, ne);
- up_write(&nm_i->nat_tree_lock);
}
- update_nats_in_cursum(sum, -i);
- mutex_unlock(&curseg->curseg_mutex);
+ update_nats_in_cursum(journal, -i);
+ up_write(&curseg->journal_rwsem);
}
static void __adjust_nat_entry_set(struct nat_entry_set *nes,
@@ -1877,24 +1963,23 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
struct nat_entry_set *set)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
bool to_journal = true;
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
struct page *page = NULL;
- struct f2fs_nm_info *nm_i = NM_I(sbi);
/*
* there are two steps to flush nat entries:
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
+ if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
if (to_journal) {
- mutex_lock(&curseg->curseg_mutex);
+ down_write(&curseg->journal_rwsem);
} else {
page = get_next_nat_page(sbi, start_nid);
nat_blk = page_address(page);
@@ -1911,35 +1996,29 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
continue;
if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
+ offset = lookup_journal_in_cursum(journal,
NAT_JOURNAL, nid, 1);
f2fs_bug_on(sbi, offset < 0);
- raw_ne = &nat_in_journal(sum, offset);
- nid_in_journal(sum, offset) = cpu_to_le32(nid);
+ raw_ne = &nat_in_journal(journal, offset);
+ nid_in_journal(journal, offset) = cpu_to_le32(nid);
} else {
raw_ne = &nat_blk->entries[nid - start_nid];
}
raw_nat_from_node_info(raw_ne, &ne->ni);
-
- down_write(&NM_I(sbi)->nat_tree_lock);
nat_reset_flag(ne);
__clear_nat_cache_dirty(NM_I(sbi), ne);
- up_write(&NM_I(sbi)->nat_tree_lock);
-
if (nat_get_blkaddr(ne) == NULL_ADDR)
add_free_nid(sbi, nid, false);
}
if (to_journal)
- mutex_unlock(&curseg->curseg_mutex);
+ up_write(&curseg->journal_rwsem);
else
f2fs_put_page(page, 1);
f2fs_bug_on(sbi, set->entry_cnt);
- down_write(&nm_i->nat_tree_lock);
radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
- up_write(&nm_i->nat_tree_lock);
kmem_cache_free(nat_entry_set_slab, set);
}
@@ -1950,7 +2029,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
struct nat_entry_set *setvec[SETVEC_SIZE];
struct nat_entry_set *set, *tmp;
unsigned int found;
@@ -1959,29 +2038,32 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!nm_i->dirty_nat_cnt)
return;
+
+ down_write(&nm_i->nat_tree_lock);
+
/*
* if there are no enough space in journal to store dirty nat
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
- if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
+ if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
remove_nats_in_journal(sbi);
- down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_set(nm_i,
set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
__adjust_nat_entry_set(setvec[idx], &sets,
- MAX_NAT_JENTRIES(sum));
+ MAX_NAT_JENTRIES(journal));
}
- up_write(&nm_i->nat_tree_lock);
/* flush dirty nats in nat entry set */
list_for_each_entry_safe(set, tmp, &sets, set_list)
__flush_nat_entry_set(sbi, set);
+ up_write(&nm_i->nat_tree_lock);
+
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
@@ -2006,6 +2088,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
+ nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index e4fffd2d98c4..1f4f9d4569d9 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,6 +25,9 @@
/* control the memory footprint threshold (10MB per 1GB ram) */
#define DEF_RAM_THRESHOLD 10
+/* control dirty nats ratio threshold (default: 10% over max nid count) */
+#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10
+
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
#define SETVEC_SIZE 32
@@ -117,6 +120,12 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
raw_ne->version = ni->version;
}
+/*
+ * Return true once dirty_nat_cnt has reached dirty_nats_ratio percent of
+ * max_nid (integer arithmetic, so the threshold is rounded down).
+ * NOTE(review): max_nid * dirty_nats_ratio is evaluated before the division
+ * by 100 - confirm the product cannot overflow the operands' type.
+ */
+static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi)
+{
+ return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid *
+ NM_I(sbi)->dirty_nats_ratio / 100;
+}
+
enum mem_type {
FREE_NIDS, /* indicates the free nid list */
NAT_ENTRIES, /* indicates the cached nat entry */
@@ -183,7 +192,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
block_addr = (pgoff_t)(nm_i->nat_blkaddr +
(seg_off << sbi->log_blocks_per_seg << 1) +
- (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+ (block_off & (sbi->blocks_per_seg - 1)));
if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
block_addr += sbi->blocks_per_seg;
@@ -317,17 +326,17 @@ static inline bool IS_DNODE(struct page *node_page)
return true;
}
-static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
- f2fs_wait_on_page_writeback(p, NODE);
+ f2fs_wait_on_page_writeback(p, NODE, true);
if (i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
else
rn->in.nid[off] = cpu_to_le32(nid);
- set_page_dirty(p);
+ return set_page_dirty(p);
}
static inline nid_t get_nid(struct page *p, int off, bool i)
@@ -370,6 +379,21 @@ static inline int is_node(struct page *page, int type)
#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
+/*
+ * The three helpers below use the page's Checked flag as an "inline node"
+ * marker: test it, set it, clear it. sync_node_pages() tests and clears
+ * the marker before calling flush_inline_data() for the owning inode.
+ */
+static inline int is_inline_node(struct page *page)
+{
+ return PageChecked(page);
+}
+
+static inline void set_inline_node(struct page *page)
+{
+ SetPageChecked(page);
+}
+
+static inline void clear_inline_node(struct page *page)
+{
+ ClearPageChecked(page);
+}
+
static inline void set_cold_node(struct inode *inode, struct page *page)
{
struct f2fs_node *rn = F2FS_NODE(page);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index cbf74f47cce8..011942f94d64 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page)
ino_of_node(page), name);
}
+/*
+ * Compare the timestamps of the in-memory @inode with the ones stored in
+ * the node page @ipage.
+ *
+ * Returns true when @ipage is not an inode page (there is nothing to
+ * compare), or when none of ctime, atime and mtime of @inode compares
+ * greater than the value read from @ipage. Returns false as soon as one
+ * of them does, i.e. timespec_compare(in-memory, on-page) > 0.
+ */
+static bool is_same_inode(struct inode *inode, struct page *ipage)
+{
+ struct f2fs_inode *ri = F2FS_INODE(ipage);
+ struct timespec disk;
+
+ if (!IS_INODE(ipage))
+ return true;
+
+ /* each on-page timestamp is little-endian: 64-bit sec, 32-bit nsec */
+ disk.tv_sec = le64_to_cpu(ri->i_ctime);
+ disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+ if (timespec_compare(&inode->i_ctime, &disk) > 0)
+ return false;
+
+ disk.tv_sec = le64_to_cpu(ri->i_atime);
+ disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
+ if (timespec_compare(&inode->i_atime, &disk) > 0)
+ return false;
+
+ disk.tv_sec = le64_to_cpu(ri->i_mtime);
+ disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ if (timespec_compare(&inode->i_mtime, &disk) > 0)
+ return false;
+
+ return true;
+}
+
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
@@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
goto next;
entry = get_fsync_inode(head, ino_of_node(page));
- if (!entry) {
+ if (entry) {
+ if (!is_same_inode(entry->inode, page))
+ goto next;
+ } else {
if (IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
@@ -321,8 +350,7 @@ got_it:
inode = dn->inode;
}
- bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
- le16_to_cpu(sum.ofs_in_node);
+ bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
/*
* if inode page is locked, unlock temporarily, but its reference
@@ -357,10 +385,9 @@ truncate_out:
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct page *page, block_t blkaddr)
{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int start, end;
struct dnode_of_data dn;
struct node_info ni;
+ unsigned int start, end;
int err = 0, recovered = 0;
/* step 1: recover xattr */
@@ -380,8 +407,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
goto out;
/* step 3: recover data indices */
- start = start_bidx_of_node(ofs_of_node(page), fi);
- end = start + ADDRS_PER_PAGE(page, fi);
+ start = start_bidx_of_node(ofs_of_node(page), inode);
+ end = start + ADDRS_PER_PAGE(page, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -389,7 +416,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (err)
goto out;
- f2fs_wait_on_page_writeback(dn.node_page, NODE);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE, true);
get_node_info(sbi, dn.nid, &ni);
f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
@@ -438,7 +465,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
- ni.version, false);
+ ni.version, false, false);
recovered++;
}
}
@@ -459,8 +486,7 @@ out:
return err;
}
-static int recover_data(struct f2fs_sb_info *sbi,
- struct list_head *head, int type)
+static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
{
unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
struct curseg_info *curseg;
@@ -469,7 +495,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
block_t blkaddr;
/* get node pages in the current segment */
- curseg = CURSEG_I(sbi, type);
+ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
while (1) {
@@ -556,7 +582,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
need_writecp = true;
/* step #2: recover data */
- err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+ err = recover_data(sbi, &inode_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
out:
@@ -565,7 +591,7 @@ out:
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
- (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+ (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
@@ -595,7 +621,7 @@ out:
.reason = CP_RECOVERY,
};
mutex_unlock(&sbi->cp_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
} else {
mutex_unlock(&sbi->cp_mutex);
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f77b3258454a..540669d6978e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
/*
* __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * @size must be an integral multiple of the number of bits in an
+ * unsigned long (BITS_PER_LONG).
* Example:
* MSB <--> LSB
* f2fs_set_bit(0, bitmap) => 1000 0000
@@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = __reverse_ulong((unsigned char *)p);
- tmp &= ~0UL >> offset;
-
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- p++;
-aligned:
- while (size & ~(BITS_PER_LONG-1)) {
+
+ while (1) {
+ if (*p == 0)
+ goto pass;
+
tmp = __reverse_ulong((unsigned char *)p);
+
+ tmp &= ~0UL >> offset;
+ if (size < BITS_PER_LONG)
+ tmp &= (~0UL << (BITS_PER_LONG - size));
if (tmp)
- goto found_middle;
- result += BITS_PER_LONG;
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
p++;
}
- if (!size)
- return result;
-
- tmp = __reverse_ulong((unsigned char *)p);
-found_first:
- tmp &= (~0UL << (BITS_PER_LONG - size));
- if (!tmp) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffs(tmp);
+ return result;
+found:
+ return result - size + __reverse_ffs(tmp);
}
static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
const unsigned long *p = addr + BIT_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long result = size;
unsigned long tmp;
if (offset >= size)
return size;
- size -= result;
+ size -= (offset & ~(BITS_PER_LONG - 1));
offset %= BITS_PER_LONG;
- if (!offset)
- goto aligned;
-
- tmp = __reverse_ulong((unsigned char *)p);
- tmp |= ~((~0UL << offset) >> offset);
-
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp != ~0UL)
- goto found_middle;
-
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- p++;
-aligned:
- while (size & ~(BITS_PER_LONG - 1)) {
+
+ while (1) {
+ if (*p == ~0UL)
+ goto pass;
+
tmp = __reverse_ulong((unsigned char *)p);
+
+ if (offset)
+ tmp |= ~0UL << (BITS_PER_LONG - offset);
+ if (size < BITS_PER_LONG)
+ tmp |= ~0UL >> size;
if (tmp != ~0UL)
- goto found_middle;
- result += BITS_PER_LONG;
+ goto found;
+pass:
+ if (size <= BITS_PER_LONG)
+ break;
size -= BITS_PER_LONG;
+ offset = 0;
p++;
}
- if (!size)
- return result;
-
- tmp = __reverse_ulong((unsigned char *)p);
-found_first:
- tmp |= ~(~0UL << (BITS_PER_LONG - size));
- if (tmp == ~0UL) /* Are any bits zero? */
- return result + size; /* Nope. */
-found_middle:
- return result + __reverse_ffz(tmp);
+ return result;
+found:
+ return result - size + __reverse_ffz(tmp);
}
void register_inmem_page(struct inode *inode, struct page *page)
@@ -211,69 +191,145 @@ void register_inmem_page(struct inode *inode, struct page *page)
trace_f2fs_register_inmem_page(page, INMEM);
}
-int commit_inmem_pages(struct inode *inode, bool abort)
+/*
+ * Release every entry on @head: reset the page's private value and flag,
+ * put the page, unlink and free the entry, and decrement the
+ * F2FS_INMEM_PAGES count.
+ *
+ * @drop:    emit the INMEM_DROP trace event for each page.
+ * @recover: before releasing a page, point its block address back at the
+ *           cur->old_addr recorded for it, via f2fs_replace_block().
+ *
+ * Returns 0, or -EAGAIN when the dnode lookup failed for at least one page
+ * while recovering. Such a page is not recovered but is still released.
+ */
+static int __revoke_inmem_pages(struct inode *inode,
+				struct list_head *head, bool drop, bool recover)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct inmem_pages *cur, *tmp;
+	int err = 0;
+
+	list_for_each_entry_safe(cur, tmp, head, list) {
+		struct page *page = cur->page;
+
+		if (drop)
+			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
+
+		lock_page(page);
+
+		if (recover) {
+			struct dnode_of_data dn;
+			struct node_info ni;
+
+			trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
+
+			set_new_dnode(&dn, inode, NULL, NULL, 0);
+			if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+				err = -EAGAIN;
+				goto next;
+			}
+			get_node_info(sbi, dn.nid, &ni);
+			f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+					cur->old_addr, ni.version, true, true);
+			f2fs_put_dnode(&dn);
+		}
+next:
+		ClearPageUptodate(page);
+		/*
+		 * Clear the private value together with the Private flag.
+		 * Clearing Uptodate a second time here left the flag set
+		 * on a page whose private value had already been zeroed.
+		 */
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+		f2fs_put_page(page, 1);
+
+		list_del(&cur->list);
+		kmem_cache_free(inmem_entry_slab, cur);
+		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+	}
+	return err;
+}
+
+/*
+ * Release every page on the inode's inmem_pages list, holding
+ * fi->inmem_lock. __revoke_inmem_pages() is called with drop = true and
+ * recover = false, so block addresses are left as they are.
+ */
+void drop_inmem_pages(struct inode *inode)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+
+ mutex_lock(&fi->inmem_lock);
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+ mutex_unlock(&fi->inmem_lock);
+}
+
+static int __commit_inmem_pages(struct inode *inode,
+ struct list_head *revoke_list)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct inmem_pages *cur, *tmp;
- bool submit_bio = false;
struct f2fs_io_info fio = {
.sbi = sbi,
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
.encrypted_page = NULL,
};
+ bool submit_bio = false;
int err = 0;
- /*
- * The abort is true only when f2fs_evict_inode is called.
- * Basically, the f2fs_evict_inode doesn't produce any data writes, so
- * that we don't need to call f2fs_balance_fs.
- * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
- * inode becomes free by iget_locked in f2fs_iget.
- */
- if (!abort) {
- f2fs_balance_fs(sbi);
- f2fs_lock_op(sbi);
- }
-
- mutex_lock(&fi->inmem_lock);
list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
- lock_page(cur->page);
- if (!abort) {
- if (cur->page->mapping == inode->i_mapping) {
- set_page_dirty(cur->page);
- f2fs_wait_on_page_writeback(cur->page, DATA);
- if (clear_page_dirty_for_io(cur->page))
- inode_dec_dirty_pages(inode);
- trace_f2fs_commit_inmem_page(cur->page, INMEM);
- fio.page = cur->page;
- err = do_write_data_page(&fio);
- if (err) {
- unlock_page(cur->page);
- break;
- }
- clear_cold_data(cur->page);
- submit_bio = true;
+ struct page *page = cur->page;
+
+ lock_page(page);
+ if (page->mapping == inode->i_mapping) {
+ trace_f2fs_commit_inmem_page(page, INMEM);
+
+ set_page_dirty(page);
+ f2fs_wait_on_page_writeback(page, DATA, true);
+ if (clear_page_dirty_for_io(page))
+ inode_dec_dirty_pages(inode);
+
+ fio.page = page;
+ err = do_write_data_page(&fio);
+ if (err) {
+ unlock_page(page);
+ break;
}
- } else {
- trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
+
+ /* record old blkaddr for revoking */
+ cur->old_addr = fio.old_blkaddr;
+
+ clear_cold_data(page);
+ submit_bio = true;
}
- set_page_private(cur->page, 0);
- ClearPagePrivate(cur->page);
- f2fs_put_page(cur->page, 1);
+ unlock_page(page);
+ list_move_tail(&cur->list, revoke_list);
+ }
- list_del(&cur->list);
- kmem_cache_free(inmem_entry_slab, cur);
- dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
+ if (submit_bio)
+ f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE);
+
+ if (!err)
+ __revoke_inmem_pages(inode, revoke_list, false, false);
+
+ return err;
+}
+
+int commit_inmem_pages(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct list_head revoke_list;
+ int err;
+
+ INIT_LIST_HEAD(&revoke_list);
+ f2fs_balance_fs(sbi, true);
+ f2fs_lock_op(sbi);
+
+ mutex_lock(&fi->inmem_lock);
+ err = __commit_inmem_pages(inode, &revoke_list);
+ if (err) {
+ int ret;
+ /*
+ * Try to revoke all committed pages. Revoking can itself fail
+ * (no memory or some other reason); in that case EAGAIN is
+ * returned, which means the transaction is no longer consistent
+ * and the caller should recover it from its journal, or rewrite
+ * and commit the last transaction again. For any other error
+ * number, the revoke was done by the filesystem itself.
+ */
+ ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
+ if (ret)
+ err = ret;
+
+ /* drop all uncommitted pages */
+ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
}
mutex_unlock(&fi->inmem_lock);
- if (!abort) {
- f2fs_unlock_op(sbi);
- if (submit_bio)
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
- }
+ f2fs_unlock_op(sbi);
return err;
}
@@ -281,8 +337,10 @@ int commit_inmem_pages(struct inode *inode, bool abort)
* This function balances dirty node and dentry pages.
* In addition, it controls garbage collection.
*/
-void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
{
+ if (!need)
+ return;
/*
* We should do GC or end up with checkpoint, if there are so many dirty
* dir/node pages without enough free segments.
@@ -308,10 +366,20 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
- excess_prefree_segs(sbi) ||
!available_free_memory(sbi, INO_ENTRIES) ||
- jiffies > sbi->cp_expires)
+ excess_prefree_segs(sbi) ||
+ excess_dirty_nats(sbi) ||
+ (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
+ if (test_opt(sbi, DATA_FLUSH)) {
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ sync_dirty_inodes(sbi, FILE_INODE);
+ blk_finish_plug(&plug);
+ }
f2fs_sync_fs(sbi->sb, true);
+ stat_inc_bg_cp_count(sbi->stat_info);
+ }
}
static int issue_flush_thread(void *data)
@@ -515,7 +583,7 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
{
- int err = -ENOTSUPP;
+ int err = -EOPNOTSUPP;
if (test_opt(sbi, DISCARD)) {
struct seg_entry *se = get_seg_entry(sbi,
@@ -817,12 +885,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
}
}
- sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
+ sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
SUM_FOOTER_SIZE) / SUMMARY_SIZE;
if (valid_sum_count <= sum_in_page)
return 1;
else if ((valid_sum_count - sum_in_page) <=
- (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
+ (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
return 2;
return 3;
}
@@ -841,9 +909,9 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
void *dst = page_address(page);
if (src)
- memcpy(dst, src, PAGE_CACHE_SIZE);
+ memcpy(dst, src, PAGE_SIZE);
else
- memset(dst, 0, PAGE_CACHE_SIZE);
+ memset(dst, 0, PAGE_SIZE);
set_page_dirty(page);
f2fs_put_page(page, 1);
}
@@ -854,6 +922,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi,
update_meta_page(sbi, (void *)sum_blk, blk_addr);
}
+static void write_current_sum_page(struct f2fs_sb_info *sbi,
+ int type, block_t blk_addr)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ struct page *page = grab_meta_page(sbi, blk_addr);
+ struct f2fs_summary_block *src = curseg->sum_blk;
+ struct f2fs_summary_block *dst;
+
+ dst = (struct f2fs_summary_block *)page_address(page);
+
+ mutex_lock(&curseg->curseg_mutex);
+
+ down_read(&curseg->journal_rwsem);
+ memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
+ up_read(&curseg->journal_rwsem);
+
+ memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
+ memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
+
+ mutex_unlock(&curseg->curseg_mutex);
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+}
+
static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -886,9 +979,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
- MAIN_SEGS(sbi), *newseg + 1);
- if (segno - *newseg < sbi->segs_per_sec -
- (*newseg % sbi->segs_per_sec))
+ (hint + 1) * sbi->segs_per_sec, *newseg + 1);
+ if (segno < (hint + 1) * sbi->segs_per_sec)
goto got_it;
}
find_other_zone:
@@ -1134,6 +1226,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
unsigned int start_segno, end_segno;
struct cp_control cpc;
+ int err = 0;
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
@@ -1164,12 +1257,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
sbi->segs_per_sec) - 1, end_segno);
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
}
out:
range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
- return 0;
+ return err;
}
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -1292,8 +1385,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio->page, fio->type);
- allocate_data_block(fio->sbi, fio->page, fio->blk_addr,
- &fio->blk_addr, sum, type);
+ allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ &fio->new_blkaddr, sum, type);
/* writeout dirty page into bdev */
f2fs_submit_page_mbio(fio);
@@ -1305,7 +1398,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
.sbi = sbi,
.type = META,
.rw = WRITE_SYNC | REQ_META | REQ_PRIO,
- .blk_addr = page->index,
+ .old_blkaddr = page->index,
+ .new_blkaddr = page->index,
.page = page,
.encrypted_page = NULL,
};
@@ -1335,19 +1429,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
do_write_page(&sum, fio);
- dn->data_blkaddr = fio->blk_addr;
+ f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
}
void rewrite_data_page(struct f2fs_io_info *fio)
{
+ fio->new_blkaddr = fio->old_blkaddr;
stat_inc_inplace_blocks(fio->sbi);
f2fs_submit_page_mbio(fio);
}
-static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
- struct f2fs_summary *sum,
+void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
block_t old_blkaddr, block_t new_blkaddr,
- bool recover_curseg)
+ bool recover_curseg, bool recover_newaddr)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
@@ -1390,7 +1484,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
- if (!recover_curseg)
+ if (!recover_curseg || recover_newaddr)
update_sit_entry(sbi, new_blkaddr, 1);
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
update_sit_entry(sbi, old_blkaddr, -1);
@@ -1414,66 +1508,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
block_t old_addr, block_t new_addr,
- unsigned char version, bool recover_curseg)
+ unsigned char version, bool recover_curseg,
+ bool recover_newaddr)
{
struct f2fs_summary sum;
set_summary(&sum, dn->nid, dn->ofs_in_node, version);
- __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg);
+ __f2fs_replace_block(sbi, &sum, old_addr, new_addr,
+ recover_curseg, recover_newaddr);
- dn->data_blkaddr = new_addr;
- set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
-}
-
-static inline bool is_merged_page(struct f2fs_sb_info *sbi,
- struct page *page, enum page_type type)
-{
- enum page_type btype = PAGE_TYPE_OF_BIO(type);
- struct f2fs_bio_info *io = &sbi->write_io[btype];
- struct bio_vec *bvec;
- struct page *target;
- int i;
-
- down_read(&io->io_rwsem);
- if (!io->bio) {
- up_read(&io->io_rwsem);
- return false;
- }
-
- bio_for_each_segment_all(bvec, io->bio, i) {
-
- if (bvec->bv_page->mapping) {
- target = bvec->bv_page;
- } else {
- struct f2fs_crypto_ctx *ctx;
-
- /* encrypted page */
- ctx = (struct f2fs_crypto_ctx *)page_private(
- bvec->bv_page);
- target = ctx->w.control_page;
- }
-
- if (page == target) {
- up_read(&io->io_rwsem);
- return true;
- }
- }
-
- up_read(&io->io_rwsem);
- return false;
+ f2fs_update_data_blkaddr(dn, new_addr);
}
void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type)
+ enum page_type type, bool ordered)
{
if (PageWriteback(page)) {
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- if (is_merged_page(sbi, page, type))
- f2fs_submit_merged_bio(sbi, type, WRITE);
- wait_on_page_writeback(page);
+ f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE);
+ if (ordered)
+ wait_on_page_writeback(page);
+ else
+ wait_for_stable_page(page);
}
}
@@ -1489,7 +1547,7 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
if (cpage) {
- f2fs_wait_on_page_writeback(cpage, DATA);
+ f2fs_wait_on_page_writeback(cpage, DATA, true);
f2fs_put_page(cpage, 1);
}
}
@@ -1510,12 +1568,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
/* Step 1: restore nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
+ memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
/* Step 2: restore sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
- SUM_JOURNAL_SIZE);
+ memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
offset = 2 * SUM_JOURNAL_SIZE;
/* Step 3: restore summary entries */
@@ -1539,7 +1596,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
s = (struct f2fs_summary *)(kaddr + offset);
seg_i->sum_blk->entries[j] = *s;
offset += SUMMARY_SIZE;
- if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ if (offset + SUMMARY_SIZE <= PAGE_SIZE -
SUM_FOOTER_SIZE)
continue;
@@ -1611,7 +1668,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
/* set uncompleted segment to curseg */
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
- memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
+
+ /* update journal info */
+ down_write(&curseg->journal_rwsem);
+ memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
+ up_write(&curseg->journal_rwsem);
+
+ memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
+ memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
curseg->next_segno = segno;
reset_curseg(sbi, type, 0);
curseg->alloc_type = ckpt->alloc_type[type];
@@ -1666,13 +1730,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
/* Step 1: write nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
- memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
+ memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
/* Step 2: write sit cache */
seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
- memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
- SUM_JOURNAL_SIZE);
+ memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
/* Step 3: write summary entries */
@@ -1694,7 +1757,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
*summary = seg_i->sum_blk->entries[j];
written_size += SUMMARY_SIZE;
- if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ if (written_size + SUMMARY_SIZE <= PAGE_SIZE -
SUM_FOOTER_SIZE)
continue;
@@ -1718,12 +1781,8 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi,
else
end = type + NR_CURSEG_NODE_TYPE;
- for (i = type; i < end; i++) {
- struct curseg_info *sum = CURSEG_I(sbi, i);
- mutex_lock(&sum->curseg_mutex);
- write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
- mutex_unlock(&sum->curseg_mutex);
- }
+ for (i = type; i < end; i++)
+ write_current_sum_page(sbi, i, blkaddr + (i - type));
}
void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -1739,24 +1798,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
}
-int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
+int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
unsigned int val, int alloc)
{
int i;
if (type == NAT_JOURNAL) {
- for (i = 0; i < nats_in_cursum(sum); i++) {
- if (le32_to_cpu(nid_in_journal(sum, i)) == val)
+ for (i = 0; i < nats_in_cursum(journal); i++) {
+ if (le32_to_cpu(nid_in_journal(journal, i)) == val)
return i;
}
- if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
- return update_nats_in_cursum(sum, 1);
+ if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
+ return update_nats_in_cursum(journal, 1);
} else if (type == SIT_JOURNAL) {
- for (i = 0; i < sits_in_cursum(sum); i++)
- if (le32_to_cpu(segno_in_journal(sum, i)) == val)
+ for (i = 0; i < sits_in_cursum(journal); i++)
+ if (le32_to_cpu(segno_in_journal(journal, i)) == val)
return i;
- if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
- return update_sits_in_cursum(sum, 1);
+ if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
+ return update_sits_in_cursum(journal, 1);
}
return -1;
}
@@ -1785,7 +1844,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
- memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ memcpy(dst_addr, src_addr, PAGE_SIZE);
set_page_dirty(dst_page);
f2fs_put_page(src_page, 1);
@@ -1860,20 +1919,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi)
static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
{
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int i;
- for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+ down_write(&curseg->journal_rwsem);
+ for (i = 0; i < sits_in_cursum(journal); i++) {
unsigned int segno;
bool dirtied;
- segno = le32_to_cpu(segno_in_journal(sum, i));
+ segno = le32_to_cpu(segno_in_journal(journal, i));
dirtied = __mark_sit_entry_dirty(sbi, segno);
if (!dirtied)
add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
}
- update_sits_in_cursum(sum, -sits_in_cursum(sum));
+ update_sits_in_cursum(journal, -i);
+ up_write(&curseg->journal_rwsem);
}
/*
@@ -1885,13 +1946,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct sit_info *sit_i = SIT_I(sbi);
unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
struct sit_entry_set *ses, *tmp;
struct list_head *head = &SM_I(sbi)->sit_entry_set;
bool to_journal = true;
struct seg_entry *se;
- mutex_lock(&curseg->curseg_mutex);
mutex_lock(&sit_i->sentry_lock);
if (!sit_i->dirty_sentries)
@@ -1908,7 +1968,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and add and account
* them in sit entry set.
*/
- if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
+ if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL))
remove_sits_in_journal(sbi);
/*
@@ -1925,10 +1985,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
unsigned int segno = start_segno;
if (to_journal &&
- !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
+ !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
to_journal = false;
- if (!to_journal) {
+ if (to_journal) {
+ down_write(&curseg->journal_rwsem);
+ } else {
page = get_next_sit_page(sbi, start_segno);
raw_sit = page_address(page);
}
@@ -1946,13 +2008,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
if (to_journal) {
- offset = lookup_journal_in_cursum(sum,
+ offset = lookup_journal_in_cursum(journal,
SIT_JOURNAL, segno, 1);
f2fs_bug_on(sbi, offset < 0);
- segno_in_journal(sum, offset) =
+ segno_in_journal(journal, offset) =
cpu_to_le32(segno);
seg_info_to_raw_sit(se,
- &sit_in_journal(sum, offset));
+ &sit_in_journal(journal, offset));
} else {
sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
seg_info_to_raw_sit(se,
@@ -1964,7 +2026,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ses->entry_cnt--;
}
- if (!to_journal)
+ if (to_journal)
+ up_write(&curseg->journal_rwsem);
+ else
f2fs_put_page(page, 1);
f2fs_bug_on(sbi, ses->entry_cnt);
@@ -1979,7 +2043,6 @@ out:
add_discard_addrs(sbi, cpc);
}
mutex_unlock(&sit_i->sentry_lock);
- mutex_unlock(&curseg->curseg_mutex);
set_prefree_as_free_segments(sbi);
}
@@ -2108,9 +2171,14 @@ static int build_curseg(struct f2fs_sb_info *sbi)
for (i = 0; i < NR_CURSEG_TYPE; i++) {
mutex_init(&array[i].curseg_mutex);
- array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!array[i].sum_blk)
return -ENOMEM;
+ init_rwsem(&array[i].journal_rwsem);
+ array[i].journal = kzalloc(sizeof(struct f2fs_journal),
+ GFP_KERNEL);
+ if (!array[i].journal)
+ return -ENOMEM;
array[i].segno = NULL_SEGNO;
array[i].next_blkoff = 0;
}
@@ -2121,11 +2189,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
- struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct f2fs_journal *journal = curseg->journal;
int sit_blk_cnt = SIT_BLK_CNT(sbi);
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
- int nrpages = MAX_BIO_BLOCKS(sbi);
+ int nrpages = MAX_BIO_BLOCKS(sbi) * 8;
do {
readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
@@ -2139,16 +2207,16 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
struct f2fs_sit_entry sit;
struct page *page;
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < sits_in_cursum(sum); i++) {
- if (le32_to_cpu(segno_in_journal(sum, i))
+ down_read(&curseg->journal_rwsem);
+ for (i = 0; i < sits_in_cursum(journal); i++) {
+ if (le32_to_cpu(segno_in_journal(journal, i))
== start) {
- sit = sit_in_journal(sum, i);
- mutex_unlock(&curseg->curseg_mutex);
+ sit = sit_in_journal(journal, i);
+ up_read(&curseg->journal_rwsem);
goto got_it;
}
}
- mutex_unlock(&curseg->curseg_mutex);
+ up_read(&curseg->journal_rwsem);
page = get_current_sit_page(sbi, start);
sit_blk = (struct f2fs_sit_block *)page_address(page);
@@ -2383,8 +2451,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi)
if (!array)
return;
SM_I(sbi)->curseg_array = NULL;
- for (i = 0; i < NR_CURSEG_TYPE; i++)
+ for (i = 0; i < NR_CURSEG_TYPE; i++) {
kfree(array[i].sum_blk);
+ kfree(array[i].journal);
+ }
kfree(array);
}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ee44d346ea44..975c33df65c7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -183,7 +183,7 @@ struct segment_allocation {
* this value is set in page as a private data which indicate that
* the page is atomically written, and it is in inmem_pages list.
*/
-#define ATOMIC_WRITTEN_PAGE 0x0000ffff
+#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1)
#define IS_ATOMIC_WRITTEN_PAGE(page) \
(page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE)
@@ -191,6 +191,7 @@ struct segment_allocation {
struct inmem_pages {
struct list_head list;
struct page *page;
+ block_t old_addr; /* for revoking when fail to commit */
};
struct sit_info {
@@ -257,6 +258,8 @@ struct victim_selection {
struct curseg_info {
struct mutex curseg_mutex; /* lock for consistency */
struct f2fs_summary_block *sum_blk; /* cached summary block */
+ struct rw_semaphore journal_rwsem; /* protect journal area */
+ struct f2fs_journal *journal; /* cached journal info */
unsigned char alloc_type; /* current allocation type */
unsigned int segno; /* current segment number */
unsigned short next_blkoff; /* next block offset to write */
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index da0d8e0b55a5..93606f281bf9 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -32,7 +32,8 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
{
- return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node);
+ return atomic_read(&sbi->total_zombie_tree) +
+ atomic_read(&sbi->total_ext_node);
}
unsigned long f2fs_shrink_count(struct shrinker *shrink,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3a65e0132352..006f87d69921 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -67,6 +67,7 @@ enum {
Opt_extent_cache,
Opt_noextent_cache,
Opt_noinline_data,
+ Opt_data_flush,
Opt_err,
};
@@ -91,6 +92,7 @@ static match_table_t f2fs_tokens = {
{Opt_extent_cache, "extent_cache"},
{Opt_noextent_cache, "noextent_cache"},
{Opt_noinline_data, "noinline_data"},
+ {Opt_data_flush, "data_flush"},
{Opt_err, NULL},
};
@@ -124,6 +126,19 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
return NULL;
}
+static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->kbytes_written +
+ BD_PART_WRITTEN(sbi)));
+}
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -202,6 +217,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \
f2fs_sbi_show, f2fs_sbi_store, \
offsetof(struct struct_name, elname))
+#define F2FS_GENERAL_RO_ATTR(name) \
+static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
+
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
@@ -214,9 +232,12 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -234,7 +255,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
+ ATTR_LIST(dirty_nats_ratio),
ATTR_LIST(cp_interval),
+ ATTR_LIST(idle_interval),
+ ATTR_LIST(lifetime_write_kbytes),
NULL,
};
@@ -406,6 +430,9 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_noinline_data:
clear_opt(sbi, INLINE_DATA);
break;
+ case Opt_data_flush:
+ set_opt(sbi, DATA_FLUSH);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -432,6 +459,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
fi->i_current_depth = 1;
fi->i_advise = 0;
init_rwsem(&fi->i_sem);
+ INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
@@ -442,10 +470,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- fi->i_crypt_info = NULL;
-#endif
return &fi->vfs_inode;
}
@@ -466,7 +490,7 @@ static int f2fs_drop_inode(struct inode *inode)
/* any remaining atomic pages should be discarded */
if (f2fs_is_atomic_file(inode))
- commit_inmem_pages(inode, true);
+ drop_inmem_pages(inode);
/* should remain fi->extent_tree for writepage */
f2fs_destroy_extent_node(inode);
@@ -479,11 +503,7 @@ static int f2fs_drop_inode(struct inode *inode)
sb_end_intwrite(inode->i_sb);
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
- if (F2FS_I(inode)->i_crypt_info)
- f2fs_free_encryption_info(inode,
- F2FS_I(inode)->i_crypt_info);
-#endif
+ fscrypt_put_encryption_info(inode, NULL);
spin_lock(&inode->i_lock);
atomic_dec(&inode->i_count);
}
@@ -548,12 +568,16 @@ static void f2fs_put_super(struct super_block *sb)
* normally superblock is clean, so we need to release this.
* In addition, EIO will skip do checkpoint, we need this as well.
*/
- release_dirty_inode(sbi);
+ release_ino_entry(sbi);
release_discard_addrs(sbi);
f2fs_leave_shrinker(sbi);
mutex_unlock(&sbi->umount_mutex);
+ /* in the cp_error case, we can wait for any writeback page */
+ if (get_pages(sbi, F2FS_WRITEBACK))
+ f2fs_flush_merged_bios(sbi);
+
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -566,13 +590,16 @@ static void f2fs_put_super(struct super_block *sb)
wait_for_completion(&sbi->s_kobj_unregister);
sb->s_fs_info = NULL;
- brelse(sbi->raw_super_buf);
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
+ kfree(sbi->raw_super);
kfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err = 0;
trace_f2fs_sync_fs(sb, sync);
@@ -582,14 +609,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
cpc.reason = __get_cp_reason(sbi);
mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
+ err = write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
- } else {
- f2fs_balance_fs(sbi);
}
f2fs_trace_ios(NULL, 1);
- return 0;
+ return err;
}
static int f2fs_freeze(struct super_block *sb)
@@ -686,6 +711,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",extent_cache");
else
seq_puts(seq, ",noextent_cache");
+ if (test_opt(sbi, DATA_FLUSH))
+ seq_puts(seq, ",data_flush");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -757,8 +784,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_stop_gc = false;
bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
- sync_filesystem(sb);
-
/*
* Save the old mount options in case we
* need to restore them.
@@ -766,6 +791,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
active_logs = sbi->active_logs;
+ if (*flags & MS_RDONLY) {
+ set_opt(sbi, FASTBOOT);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ }
+
+ sync_filesystem(sb);
+
sbi->mount_opt.opt = 0;
default_options(sbi);
@@ -853,6 +885,41 @@ static struct super_operations f2fs_sops = {
.remount_fs = f2fs_remount,
};
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, NULL);
+}
+
+static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
+ void *fs_data)
+{
+ return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+ F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+ ctx, len, fs_data, XATTR_CREATE);
+}
+
+static unsigned f2fs_max_namelen(struct inode *inode)
+{
+ return S_ISLNK(inode->i_mode) ?
+ inode->i_sb->s_blocksize : F2FS_NAME_LEN;
+}
+
+static struct fscrypt_operations f2fs_cryptops = {
+ .get_context = f2fs_get_context,
+ .set_context = f2fs_set_context,
+ .is_encrypted = f2fs_encrypted_inode,
+ .empty_dir = f2fs_empty_dir,
+ .max_namelen = f2fs_max_namelen,
+};
+#else
+static struct fscrypt_operations f2fs_cryptops = {
+ .is_encrypted = f2fs_encrypted_inode,
+};
+#endif
+
static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
@@ -898,7 +965,7 @@ static const struct export_operations f2fs_export_ops = {
.get_parent = f2fs_get_parent,
};
-static loff_t max_file_size(unsigned bits)
+static loff_t max_file_blocks(void)
{
loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
loff_t leaf_count = ADDRS_PER_BLOCK;
@@ -914,13 +981,128 @@ static loff_t max_file_size(unsigned bits)
leaf_count *= NIDS_PER_BLOCK;
result += leaf_count;
- result <<= bits;
return result;
}
+static int __f2fs_commit_super(struct buffer_head *bh,
+ struct f2fs_super_block *super)
+{
+ lock_buffer(bh);
+ if (super)
+ memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ /* this is a rare case, so we can always use FUA */
+ return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+}
+
+static inline bool sanity_check_area_boundary(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
+ (bh->b_data + F2FS_SUPER_OFFSET);
+ u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+ u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
+ u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr);
+ u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr);
+ u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+ u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+ u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt);
+ u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit);
+ u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat);
+ u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa);
+ u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main);
+ u32 segment_count = le32_to_cpu(raw_super->segment_count);
+ u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+ u64 main_end_blkaddr = main_blkaddr +
+ (segment_count_main << log_blocks_per_seg);
+ u64 seg_end_blkaddr = segment0_blkaddr +
+ (segment_count << log_blocks_per_seg);
+
+ if (segment0_blkaddr != cp_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Mismatch start address, segment0(%u) cp_blkaddr(%u)",
+ segment0_blkaddr, cp_blkaddr);
+ return true;
+ }
+
+ if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) !=
+ sit_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong CP boundary, start(%u) end(%u) blocks(%u)",
+ cp_blkaddr, sit_blkaddr,
+ segment_count_ckpt << log_blocks_per_seg);
+ return true;
+ }
+
+ if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) !=
+ nat_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong SIT boundary, start(%u) end(%u) blocks(%u)",
+ sit_blkaddr, nat_blkaddr,
+ segment_count_sit << log_blocks_per_seg);
+ return true;
+ }
+
+ if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) !=
+ ssa_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong NAT boundary, start(%u) end(%u) blocks(%u)",
+ nat_blkaddr, ssa_blkaddr,
+ segment_count_nat << log_blocks_per_seg);
+ return true;
+ }
+
+ if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) !=
+ main_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong SSA boundary, start(%u) end(%u) blocks(%u)",
+ ssa_blkaddr, main_blkaddr,
+ segment_count_ssa << log_blocks_per_seg);
+ return true;
+ }
+
+ if (main_end_blkaddr > seg_end_blkaddr) {
+ f2fs_msg(sb, KERN_INFO,
+ "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)",
+ main_blkaddr,
+ segment0_blkaddr +
+ (segment_count << log_blocks_per_seg),
+ segment_count_main << log_blocks_per_seg);
+ return true;
+ } else if (main_end_blkaddr < seg_end_blkaddr) {
+ int err = 0;
+ char *res;
+
+ /* fix in-memory information all the time */
+ raw_super->segment_count = cpu_to_le32((main_end_blkaddr -
+ segment0_blkaddr) >> log_blocks_per_seg);
+
+ if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) {
+ res = "internally";
+ } else {
+ err = __f2fs_commit_super(bh, NULL);
+ res = err ? "failed" : "done";
+ }
+ f2fs_msg(sb, KERN_INFO,
+ "Fix alignment : %s, start(%u) end(%u) block(%u)",
+ res, main_blkaddr,
+ segment0_blkaddr +
+ (segment_count << log_blocks_per_seg),
+ segment_count_main << log_blocks_per_seg);
+ if (err)
+ return true;
+ }
+ return false;
+}
+
static int sanity_check_raw_super(struct super_block *sb,
- struct f2fs_super_block *raw_super)
+ struct buffer_head *bh)
{
+ struct f2fs_super_block *raw_super = (struct f2fs_super_block *)
+ (bh->b_data + F2FS_SUPER_OFFSET);
unsigned int blocksize;
if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
@@ -931,10 +1113,10 @@ static int sanity_check_raw_super(struct super_block *sb,
}
/* Currently, support only 4KB page cache size */
- if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+ if (F2FS_BLKSIZE != PAGE_SIZE) {
f2fs_msg(sb, KERN_INFO,
"Invalid page_cache_size (%lu), supports only 4KB\n",
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
return 1;
}
@@ -947,6 +1129,14 @@ static int sanity_check_raw_super(struct super_block *sb,
return 1;
}
+ /* check log blocks per segment */
+ if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid log blocks per segment (%u)\n",
+ le32_to_cpu(raw_super->log_blocks_per_seg));
+ return 1;
+ }
+
/* Currently, support 512/1024/2048/4096 bytes sector size */
if (le32_to_cpu(raw_super->log_sectorsize) >
F2FS_MAX_LOG_SECTOR_SIZE ||
@@ -965,10 +1155,27 @@ static int sanity_check_raw_super(struct super_block *sb,
le32_to_cpu(raw_super->log_sectorsize));
return 1;
}
+
+ /* check reserved ino info */
+ if (le32_to_cpu(raw_super->node_ino) != 1 ||
+ le32_to_cpu(raw_super->meta_ino) != 2 ||
+ le32_to_cpu(raw_super->root_ino) != 3) {
+ f2fs_msg(sb, KERN_INFO,
+ "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)",
+ le32_to_cpu(raw_super->node_ino),
+ le32_to_cpu(raw_super->meta_ino),
+ le32_to_cpu(raw_super->root_ino));
+ return 1;
+ }
+
+ /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
+ if (sanity_check_area_boundary(sb, bh))
+ return 1;
+
return 0;
}
-static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
+int sanity_check_ckpt(struct f2fs_sb_info *sbi)
{
unsigned int total, fsmeta;
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
@@ -1018,7 +1225,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
sbi->dir_level = DEF_DIR_LEVEL;
- sbi->cp_interval = DEF_CP_INTERVAL;
+ sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
+ sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
INIT_LIST_HEAD(&sbi->s_list);
@@ -1027,98 +1235,86 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
/*
* Read f2fs raw super block.
- * Because we have two copies of super block, so read the first one at first,
- * if the first one is invalid, move to read the second one.
+ * There are two copies of the super block, so read both of them
+ * to get the first valid one. If either of them is broken, we pass
+ * the recovery flag back to the caller.
*/
static int read_raw_super_block(struct super_block *sb,
struct f2fs_super_block **raw_super,
- struct buffer_head **raw_super_buf,
- int *recovery)
+ int *valid_super_block, int *recovery)
{
- int block = 0;
- struct buffer_head *buffer;
+ int block;
+ struct buffer_head *bh;
struct f2fs_super_block *super;
int err = 0;
-retry:
- buffer = sb_bread(sb, block);
- if (!buffer) {
- *recovery = 1;
- f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
+ super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
+ if (!super)
+ return -ENOMEM;
+
+ for (block = 0; block < 2; block++) {
+ bh = sb_bread(sb, block);
+ if (!bh) {
+ f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
block + 1);
- if (block == 0) {
- block++;
- goto retry;
- } else {
err = -EIO;
- goto out;
+ continue;
}
- }
- super = (struct f2fs_super_block *)
- ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET);
-
- /* sanity checking of raw super */
- if (sanity_check_raw_super(sb, super)) {
- brelse(buffer);
- *recovery = 1;
- f2fs_msg(sb, KERN_ERR,
- "Can't find valid F2FS filesystem in %dth superblock",
- block + 1);
- if (block == 0) {
- block++;
- goto retry;
- } else {
+ /* sanity checking of raw super */
+ if (sanity_check_raw_super(sb, bh)) {
+ f2fs_msg(sb, KERN_ERR,
+ "Can't find valid F2FS filesystem in %dth superblock",
+ block + 1);
err = -EINVAL;
- goto out;
+ brelse(bh);
+ continue;
}
- }
- if (!*raw_super) {
- *raw_super_buf = buffer;
- *raw_super = super;
- } else {
- /* already have a valid superblock */
- brelse(buffer);
+ if (!*raw_super) {
+ memcpy(super, bh->b_data + F2FS_SUPER_OFFSET,
+ sizeof(*super));
+ *valid_super_block = block;
+ *raw_super = super;
+ }
+ brelse(bh);
}
- /* check the validity of the second superblock */
- if (block == 0) {
- block++;
- goto retry;
- }
+ /* Failed to read or validate at least one of the superblocks */
+ if (err < 0)
+ *recovery = 1;
-out:
/* No valid superblock */
if (!*raw_super)
- return err;
+ kfree(super);
+ else
+ err = 0;
- return 0;
+ return err;
}
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
{
- struct buffer_head *sbh = sbi->raw_super_buf;
- sector_t block = sbh->b_blocknr;
+ struct buffer_head *bh;
int err;
/* write back-up superblock first */
- sbh->b_blocknr = block ? 0 : 1;
- mark_buffer_dirty(sbh);
- err = sync_dirty_buffer(sbh);
-
- sbh->b_blocknr = block;
+ bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1);
+ if (!bh)
+ return -EIO;
+ err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
+ brelse(bh);
/* if we are in recovery path, skip writing valid superblock */
if (recover || err)
- goto out;
+ return err;
/* write current valid superblock */
- mark_buffer_dirty(sbh);
- err = sync_dirty_buffer(sbh);
-out:
- clear_buffer_write_io_error(sbh);
- set_buffer_uptodate(sbh);
+ bh = sb_getblk(sbi->sb, sbi->valid_super_block);
+ if (!bh)
+ return -EIO;
+ err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi));
+ brelse(bh);
return err;
}
@@ -1126,17 +1322,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
{
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
- struct buffer_head *raw_super_buf;
struct inode *root;
long err;
bool retry = true, need_fsck = false;
char *options = NULL;
- int recovery, i;
+ int recovery, i, valid_super_block;
+ struct curseg_info *seg_i;
try_onemore:
err = -EINVAL;
raw_super = NULL;
- raw_super_buf = NULL;
+ valid_super_block = -1;
recovery = 0;
/* allocate memory for f2fs-specific super block info */
@@ -1144,13 +1340,23 @@ try_onemore:
if (!sbi)
return -ENOMEM;
+ /* Load the checksum driver */
+ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
+ if (IS_ERR(sbi->s_chksum_driver)) {
+ f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver.");
+ err = PTR_ERR(sbi->s_chksum_driver);
+ sbi->s_chksum_driver = NULL;
+ goto free_sbi;
+ }
+
/* set a block size */
if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
goto free_sbi;
}
- err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery);
+ err = read_raw_super_block(sb, &raw_super, &valid_super_block,
+ &recovery);
if (err)
goto free_sbi;
@@ -1167,11 +1373,14 @@ try_onemore:
if (err)
goto free_options;
- sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+ sbi->max_file_blocks = max_file_blocks();
+ sb->s_maxbytes = sbi->max_file_blocks <<
+ le32_to_cpu(raw_super->log_blocksize);
sb->s_max_links = F2FS_LINK_MAX;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
sb->s_op = &f2fs_sops;
+ sb->s_cop = &f2fs_cryptops;
sb->s_xattr = f2fs_xattr_handlers;
sb->s_export_op = &f2fs_export_ops;
sb->s_magic = F2FS_SUPER_MAGIC;
@@ -1183,7 +1392,7 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->sb = sb;
sbi->raw_super = raw_super;
- sbi->raw_super_buf = raw_super_buf;
+ sbi->valid_super_block = valid_super_block;
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
@@ -1220,13 +1429,6 @@ try_onemore:
goto free_meta_inode;
}
- /* sanity checking of checkpoint */
- err = -EINVAL;
- if (sanity_check_ckpt(sbi)) {
- f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
- goto free_cp;
- }
-
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
sbi->total_valid_inode_count =
@@ -1236,8 +1438,10 @@ try_onemore:
le64_to_cpu(sbi->ckpt->valid_block_count);
sbi->last_valid_block_count = sbi->total_valid_block_count;
sbi->alloc_valid_block_count = 0;
- INIT_LIST_HEAD(&sbi->dir_inode_list);
- spin_lock_init(&sbi->dir_inode_lock);
+ for (i = 0; i < NR_INODE_TYPE; i++) {
+ INIT_LIST_HEAD(&sbi->inode_list[i]);
+ spin_lock_init(&sbi->inode_lock[i]);
+ }
init_extent_cache_info(sbi);
@@ -1257,6 +1461,17 @@ try_onemore:
goto free_nm;
}
+ /* For write statistics */
+ if (sb->s_bdev->bd_part)
+ sbi->sectors_written_start =
+ (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]);
+
+ /* Read accumulated write IO statistics if exists */
+ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
+ if (__exist_node_summaries(sbi))
+ sbi->kbytes_written =
+ le64_to_cpu(seg_i->journal->info.kbytes_written);
+
build_gc_manager(sbi);
/* get an inode for node space */
@@ -1351,16 +1566,20 @@ try_onemore:
/* recover broken superblock */
if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) {
- f2fs_msg(sb, KERN_INFO, "Recover invalid superblock");
- f2fs_commit_super(sbi, true);
+ err = f2fs_commit_super(sbi, true);
+ f2fs_msg(sb, KERN_INFO,
+ "Try to recover %dth superblock, ret: %ld",
+ sbi->valid_super_block ? 1 : 2, err);
}
- sbi->cp_expires = round_jiffies_up(jiffies);
-
+ f2fs_update_time(sbi, CP_TIME);
+ f2fs_update_time(sbi, REQ_TIME);
return 0;
free_kobj:
kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
free_proc:
if (sbi->s_proc) {
remove_proc_entry("segment_info", sbi->s_proc);
@@ -1379,7 +1598,6 @@ free_nm:
destroy_node_manager(sbi);
free_sm:
destroy_segment_manager(sbi);
-free_cp:
kfree(sbi->ckpt);
free_meta_inode:
make_bad_inode(sbi->meta_inode);
@@ -1387,8 +1605,10 @@ free_meta_inode:
free_options:
kfree(options);
free_sb_buf:
- brelse(raw_super_buf);
+ kfree(raw_super);
free_sbi:
+ if (sbi->s_chksum_driver)
+ crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi);
/* give only one another chance */
@@ -1424,8 +1644,9 @@ MODULE_ALIAS_FS("f2fs");
static int __init init_inodecache(void)
{
- f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
- sizeof(struct f2fs_inode_info));
+ f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+ sizeof(struct f2fs_inode_info), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
if (!f2fs_inode_cachep)
return -ENOMEM;
return 0;
@@ -1467,25 +1688,23 @@ static int __init init_f2fs_fs(void)
err = -ENOMEM;
goto free_extent_cache;
}
- err = f2fs_init_crypto();
- if (err)
- goto free_kset;
-
err = register_shrinker(&f2fs_shrinker_info);
if (err)
- goto free_crypto;
+ goto free_kset;
err = register_filesystem(&f2fs_fs_type);
if (err)
goto free_shrinker;
- f2fs_create_root_stats();
+ err = f2fs_create_root_stats();
+ if (err)
+ goto free_filesystem;
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
return 0;
+free_filesystem:
+ unregister_filesystem(&f2fs_fs_type);
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
-free_crypto:
- f2fs_exit_crypto();
free_kset:
kset_unregister(f2fs_kset);
free_extent_cache:
@@ -1508,7 +1727,6 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_root_stats();
unregister_shrinker(&f2fs_shrinker_info);
unregister_filesystem(&f2fs_fs_type);
- f2fs_exit_crypto();
destroy_extent_cache();
destroy_checkpoint_caches();
destroy_segment_manager_caches();
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 145fb659ad44..562ce0821559 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -29,7 +29,8 @@ static inline void __print_last_io(void)
last_io.major, last_io.minor,
last_io.pid, "----------------",
last_io.type,
- last_io.fio.rw, last_io.fio.blk_addr,
+ last_io.fio.rw,
+ last_io.fio.new_blkaddr,
last_io.len);
memset(&last_io, 0, sizeof(last_io));
}
@@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
last_io.pid == pid &&
last_io.type == __file_type(inode, pid) &&
last_io.fio.rw == fio->rw &&
- last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
+ last_io.fio.new_blkaddr + last_io.len ==
+ fio->new_blkaddr) {
last_io.len++;
return;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 4de2286c0e4d..06a72dc0191a 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -25,49 +25,37 @@
#include "f2fs.h"
#include "xattr.h"
-static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- int total_len, prefix_len = 0;
- const char *prefix = NULL;
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
- prefix = XATTR_USER_PREFIX;
- prefix_len = XATTR_USER_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_TRUSTED:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- prefix = XATTR_TRUSTED_PREFIX;
- prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_SECURITY:
- prefix = XATTR_SECURITY_PREFIX;
- prefix_len = XATTR_SECURITY_PREFIX_LEN;
break;
default:
return -EINVAL;
}
-
- total_len = prefix_len + len + 1;
- if (list && total_len <= list_size) {
- memcpy(list, prefix, prefix_len);
- memcpy(list + prefix_len, name, len);
- list[prefix_len + len] = '\0';
- }
- return total_len;
+ return f2fs_getxattr(d_inode(dentry), handler->flags, name,
+ buffer, size, NULL);
}
-static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
@@ -81,72 +69,39 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
default:
return -EINVAL;
}
- if (strcmp(name, "") == 0)
- return -EINVAL;
- return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
+ return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+ value, size, NULL, flags);
}
-static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static bool f2fs_xattr_user_list(struct dentry *dentry)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
- case F2FS_XATTR_INDEX_USER:
- if (!test_opt(sbi, XATTR_USER))
- return -EOPNOTSUPP;
- break;
- case F2FS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- break;
- case F2FS_XATTR_INDEX_SECURITY:
- break;
- default:
- return -EINVAL;
- }
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- return f2fs_setxattr(d_inode(dentry), type, name,
- value, size, NULL, flags);
+ return test_opt(sbi, XATTR_USER);
}
-static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static bool f2fs_xattr_trusted_list(struct dentry *dentry)
{
- const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
- size_t size;
-
- if (type != F2FS_XATTR_INDEX_ADVISE)
- return 0;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return capable(CAP_SYS_ADMIN);
}
-static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
if (buffer)
*((char *)buffer) = F2FS_I(inode)->i_advise;
return sizeof(char);
}
-static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct inode *inode = d_inode(dentry);
- if (strcmp(name, "") != 0)
- return -EINVAL;
if (!inode_owner_or_capable(inode))
return -EPERM;
if (value == NULL)
@@ -185,7 +140,7 @@ int f2fs_init_security(struct inode *inode, struct inode *dir,
const struct xattr_handler f2fs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = F2FS_XATTR_INDEX_USER,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_user_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
@@ -193,15 +148,14 @@ const struct xattr_handler f2fs_xattr_user_handler = {
const struct xattr_handler f2fs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.flags = F2FS_XATTR_INDEX_TRUSTED,
- .list = f2fs_xattr_generic_list,
+ .list = f2fs_xattr_trusted_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
const struct xattr_handler f2fs_xattr_advise_handler = {
- .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+ .name = F2FS_SYSTEM_ADVISE_NAME,
.flags = F2FS_XATTR_INDEX_ADVISE,
- .list = f2fs_xattr_advise_list,
.get = f2fs_xattr_advise_get,
.set = f2fs_xattr_advise_set,
};
@@ -209,7 +163,6 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
const struct xattr_handler f2fs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = F2FS_XATTR_INDEX_SECURITY,
- .list = f2fs_xattr_generic_list,
.get = f2fs_xattr_generic_get,
.set = f2fs_xattr_generic_set,
};
@@ -347,7 +300,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
if (ipage) {
inline_addr = inline_xattr_addr(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE);
+ f2fs_wait_on_page_writeback(ipage, NODE, true);
} else {
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -355,7 +308,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return PTR_ERR(page);
}
inline_addr = inline_xattr_addr(page);
- f2fs_wait_on_page_writeback(page, NODE);
+ f2fs_wait_on_page_writeback(page, NODE, true);
}
memcpy(inline_addr, txattr_addr, inline_size);
f2fs_put_page(page, 1);
@@ -376,7 +329,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return PTR_ERR(xpage);
}
f2fs_bug_on(sbi, new_nid);
- f2fs_wait_on_page_writeback(xpage, NODE);
+ f2fs_wait_on_page_writeback(xpage, NODE, true);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
@@ -457,20 +410,27 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
f2fs_xattr_handler(entry->e_name_index);
+ const char *prefix;
+ size_t prefix_len;
size_t size;
- if (!handler)
+ if (!handler || (handler->list && !handler->list(dentry)))
continue;
- size = handler->list(dentry, buffer, rest, entry->e_name,
- entry->e_name_len, handler->flags);
- if (buffer && size > rest) {
- error = -ERANGE;
- goto cleanup;
+ prefix = handler->prefix ?: handler->name;
+ prefix_len = strlen(prefix);
+ size = prefix_len + entry->e_name_len + 1;
+ if (buffer) {
+ if (size > rest) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, entry->e_name, entry->e_name_len);
+ buffer += entry->e_name_len;
+ *buffer++ = 0;
}
-
- if (buffer)
- buffer += size;
rest -= size;
}
error = buffer_size - rest;
@@ -611,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
if (ipage)
return __f2fs_setxattr(inode, index, name, value,
size, ipage, flags);
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
/* protect xattr_ver */
@@ -620,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
+ f2fs_update_time(sbi, REQ_TIME);
return err;
}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 71a7100d5492..f990de20cdcd 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -27,7 +27,7 @@
#define F2FS_XATTR_REFCOUNT_MAX 1024
/* Name indexes */
-#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
+#define F2FS_SYSTEM_ADVISE_NAME "system.advise"
#define F2FS_XATTR_INDEX_USER 1
#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
@@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#define f2fs_xattr_handlers NULL
static inline int f2fs_setxattr(struct inode *inode, int index,
- const char *name, const void *value, size_t size, int flags)
+ const char *name, const void *value, size_t size,
+ struct page *page, int flags)
{
return -EOPNOTSUPP;
}
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index 182f9ffe2b51..3ff1772f612e 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -93,8 +93,24 @@ config FAT_DEFAULT_IOCHARSET
that most of your FAT filesystems use, and can be overridden
with the "iocharset" mount option for FAT filesystems.
Note that "utf8" is not recommended for FAT filesystems.
- If unsure, you shouldn't set "utf8" here.
+ If unsure, you shouldn't set "utf8" here - select the next option
+ instead if you would like to use UTF-8 encoded file names by default.
See <file:Documentation/filesystems/vfat.txt> for more information.
Enable any character sets you need in File Systems/Native Language
Support.
+
+config FAT_DEFAULT_UTF8
+ bool "Enable FAT UTF-8 option by default"
+ depends on VFAT_FS
+ default n
+ help
+ Set this if you would like to have "utf8" mount option set
+ by default when mounting FAT filesystems.
+
+ Even if you say Y here, you can always disable UTF-8 for a
+ particular mount by adding "utf8=0" to the mount options.
+
+ Say Y if you use UTF-8 encoding for file names, N otherwise.
+
+ See <file:Documentation/filesystems/vfat.txt> for more information.
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232ec2..5d384921524d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
return dclus;
}
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int cluster, offset;
+
+ cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+ offset = sector & (sbi->sec_per_clus - 1);
+ cluster = fat_bmap_cluster(inode, cluster);
+ if (cluster < 0)
+ return cluster;
+ else if (cluster) {
+ *bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+ *mapped_blocks = sbi->sec_per_clus - offset;
+ if (*mapped_blocks > last_block - sector)
+ *mapped_blocks = last_block - sector;
+ }
+
+ return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+ sector_t *last_block, int create)
+{
+ struct super_block *sb = inode->i_sb;
const unsigned long blocksize = sb->s_blocksize;
const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+ *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+ if (sector >= *last_block) {
+ if (!create)
+ return 1;
+
+ /*
+ * ->mmu_private can be accessed only on the allocation path.
+ * (caller must hold ->i_mutex)
+ */
+ *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+ >> blocksize_bits;
+ if (sector >= *last_block)
+ return 1;
+ }
+
+ return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+ unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+ struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
sector_t last_block;
- int cluster, offset;
*phys = 0;
*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
return 0;
}
- last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
- if (sector >= last_block) {
- if (!create)
+ if (!from_bmap) {
+ if (is_exceed_eof(inode, sector, &last_block, create))
return 0;
-
- /*
- * ->mmu_private can access on only allocation path.
- * (caller must hold ->i_mutex)
- */
- last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
- >> blocksize_bits;
+ } else {
+ last_block = inode->i_blocks >>
+ (inode->i_sb->s_blocksize_bits - 9);
if (sector >= last_block)
return 0;
}
- cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
- offset = sector & (sbi->sec_per_clus - 1);
- cluster = fat_bmap_cluster(inode, cluster);
- if (cluster < 0)
- return cluster;
- else if (cluster) {
- *phys = fat_clus_to_blknr(sbi, cluster) + offset;
- *mapped_blocks = sbi->sec_per_clus - offset;
- if (*mapped_blocks > last_block - sector)
- *mapped_blocks = last_block - sector;
- }
- return 0;
+ return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+ phys);
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 4afc4d9d2e41..d0b95c95079b 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
*bh = NULL;
iblock = *pos >> sb->s_blocksize_bits;
- err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+ err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
if (err || !phys)
return -1; /* beyond EOF or error */
@@ -610,9 +610,9 @@ parse_record:
int status = fat_parse_long(inode, &cpos, &bh, &de,
&unicode, &nr_slots);
if (status < 0) {
- ctx->pos = cpos;
+ bh = NULL;
ret = status;
- goto out;
+ goto end_of_dir;
} else if (status == PARSE_INVALID)
goto record_end;
else if (status == PARSE_NOT_LONGNAME)
@@ -654,8 +654,9 @@ parse_record:
fill_len = short_len;
start_filldir:
- if (!fake_offset)
- ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+ ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
+ if (fake_offset && ctx->pos < 2)
+ ctx->pos = 2;
if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) {
if (!dir_emit_dot(file, ctx))
@@ -681,14 +682,19 @@ record_end:
fake_offset = 0;
ctx->pos = cpos;
goto get_new;
+
end_of_dir:
- ctx->pos = cpos;
+ if (fake_offset && cpos < 2)
+ ctx->pos = 2;
+ else
+ ctx->pos = cpos;
fill_failed:
brelse(bh);
if (unicode)
__putname(unicode);
out:
mutex_unlock(&sbi->s_lock);
+
return ret;
}
@@ -763,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
buf.dirent = dirent;
buf.result = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
buf.ctx.pos = file->f_pos;
ret = -ENOENT;
if (!IS_DEADDIR(inode)) {
@@ -771,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
short_only, both ? &buf : NULL);
file->f_pos = buf.ctx.pos;
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret >= 0)
ret = buf.result;
return ret;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323bab..e6b764a17a9c 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -87,7 +87,7 @@ struct msdos_sb_info {
unsigned int vol_id; /*volume ID*/
int fatent_shift;
- struct fatent_operations *fatent_ops;
+ const struct fatent_operations *fatent_ops;
struct inode *fat_inode;
struct inode *fsinfo_inode;
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
extern void fat_cache_inval_inode(struct inode *inode);
extern int fat_get_cluster(struct inode *inode, int cluster,
int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+ sector_t last_block,
+ unsigned long *mapped_blocks, sector_t *bmap);
extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
- unsigned long *mapped_blocks, int create);
+ unsigned long *mapped_blocks, int create, bool from_bmap);
/* fat/dir.c */
extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
{
return hash_32(logstart, FAT_HASH_BITS);
}
+extern int fat_add_cluster(struct inode *inode);
/* fat/misc.c */
extern __printf(3, 4) __cold
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 8226557130a2..1d9a8c4e9de0 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -99,7 +99,7 @@ err:
static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
int offset, sector_t blocknr)
{
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
@@ -246,7 +246,7 @@ static int fat32_ent_next(struct fat_entry *fatent)
return 0;
}
-static struct fatent_operations fat12_ops = {
+static const struct fatent_operations fat12_ops = {
.ent_blocknr = fat12_ent_blocknr,
.ent_set_ptr = fat12_ent_set_ptr,
.ent_bread = fat12_ent_bread,
@@ -255,7 +255,7 @@ static struct fatent_operations fat12_ops = {
.ent_next = fat12_ent_next,
};
-static struct fatent_operations fat16_ops = {
+static const struct fatent_operations fat16_ops = {
.ent_blocknr = fat_ent_blocknr,
.ent_set_ptr = fat16_ent_set_ptr,
.ent_bread = fat_ent_bread,
@@ -264,7 +264,7 @@ static struct fatent_operations fat16_ops = {
.ent_next = fat16_ent_next,
};
-static struct fatent_operations fat32_ops = {
+static const struct fatent_operations fat32_ops = {
.ent_blocknr = fat_ent_blocknr,
.ent_set_ptr = fat32_ent_set_ptr,
.ent_bread = fat_ent_bread,
@@ -320,7 +320,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
int offset, sector_t blocknr)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct buffer_head **bhs = fatent->bhs;
/* Is this fatent's blocks including this entry? */
@@ -349,7 +349,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
int err, offset;
sector_t blocknr;
@@ -407,7 +407,7 @@ int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
int new, int wait)
{
struct super_block *sb = inode->i_sb;
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
int err;
ops->ent_put(fatent, new);
@@ -432,7 +432,7 @@ static inline int fat_ent_next(struct msdos_sb_info *sbi,
static inline int fat_ent_read_block(struct super_block *sb,
struct fat_entry *fatent)
{
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
sector_t blocknr;
int offset;
@@ -463,7 +463,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent, prev_ent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, count, err, nr_bhs, idx_clus;
@@ -551,7 +551,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
{
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, err, nr_bhs;
@@ -636,7 +636,7 @@ EXPORT_SYMBOL_GPL(fat_free_clusters);
static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
unsigned long reada_blocks)
{
- struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+ const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
sector_t blocknr;
int i, offset;
@@ -649,7 +649,7 @@ static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
int fat_count_free_clusters(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
- struct fatent_operations *ops = sbi->fatent_ops;
+ const struct fatent_operations *ops = sbi->fatent_ops;
struct fat_entry fatent;
unsigned long reada_blocks, reada_mask, cur_block;
int err = 0, free;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f1039909a..f70185668832 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,15 +14,19 @@
#include <linux/backing-dev.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
+#include <linux/falloc.h>
#include "fat.h"
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len);
+
static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
{
u32 attr;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
attr = fat_make_attrs(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return put_user(attr, user_attr);
}
@@ -43,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
err = mnt_want_write_file(file);
if (err)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
fat_save_attrs(inode, attr);
mark_inode_dirty(inode);
out_unlock_inode:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mnt_drop_write_file(file);
out:
return err;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
#endif
.fsync = fat_file_fsync,
.splice_read = generic_file_splice_read,
+ .fallocate = fat_fallocate,
};
static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
return err;
}
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ int nr_cluster; /* Number of clusters to be allocated */
+ loff_t mm_bytes; /* Number of bytes to be allocated for file */
+ loff_t ondisksize; /* block-aligned on-disk size in bytes */
+ struct inode *inode = file->f_mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int err = 0;
+
+ /* No support for hole punch or other fallocate flags. */
+ if (mode & ~FALLOC_FL_KEEP_SIZE)
+ return -EOPNOTSUPP;
+
+ /* Only regular files are supported (no directories etc.) */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ondisksize = inode->i_blocks << 9;
+ if ((offset + len) <= ondisksize)
+ goto error;
+
+ /* First compute the number of clusters to be allocated */
+ mm_bytes = offset + len - ondisksize;
+ nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+ sbi->cluster_bits;
+
+ /* Start the allocation. We are not zeroing out the clusters. */
+ while (nr_cluster-- > 0) {
+ err = fat_add_cluster(inode);
+ if (err)
+ goto error;
+ }
+ } else {
+ if ((offset + len) <= i_size_read(inode))
+ goto error;
+
+ /* This is just an expanding truncate */
+ err = fat_cont_expand(inode, (offset + len));
+ }
+
+error:
+ inode_unlock(inode);
+ return err;
+}
+
/* Free all clusters after the skip'th cluster. */
static int fat_free(struct inode *inode, int skip)
{
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3698..226281068a46 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
},
};
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
{
int err, cluster;
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
struct super_block *sb = inode->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
unsigned long mapped_blocks;
- sector_t phys;
+ sector_t phys, last_block;
int err, offset;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
return -EIO;
}
+ last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
- if (!offset) {
+ /*
+ * Allocate a new cluster only when both of the following hold:
+ * 1) iblock is the first block of a cluster (offset == 0)
+ * 2) iblock is not within the already allocated (fallocated) blocks
+ */
+ if (!offset && !(iblock < last_block)) {
/* TODO: multiple cluster allocation would be desirable. */
err = fat_add_cluster(inode);
if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
*max_blocks = min(mapped_blocks, *max_blocks);
MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
- err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+ err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
if (err)
return err;
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int err;
+ sector_t bmap;
+ unsigned long mapped_blocks;
+
+ BUG_ON(create != 0);
+
+ err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+ if (err)
+ return err;
+
+ if (bmap) {
+ map_bh(bh_result, sb, bmap);
+ max_blocks = min(mapped_blocks, max_blocks);
+ }
+
+ bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+ return 0;
+}
+
static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
{
sector_t blocknr;
/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
down_read(&MSDOS_I(mapping->host)->truncate_lock);
- blocknr = generic_block_bmap(mapping, block, fat_get_block);
+ blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
up_read(&MSDOS_I(mapping->host)->truncate_lock);
return blocknr;
@@ -449,6 +480,24 @@ static int fat_calc_dir_size(struct inode *inode)
return 0;
}
+static int fat_validate_dir(struct inode *dir)
+{
+ struct super_block *sb = dir->i_sb;
+
+ if (dir->i_nlink < 2) {
+ /* Directory should have "."/".." entries at least. */
+ fat_fs_error(sb, "corrupted directory (invalid entries)");
+ return -EIO;
+ }
+ if (MSDOS_I(dir)->i_start == 0 ||
+ MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) {
+ /* Directory should point to a valid cluster. */
+ fat_fs_error(sb, "corrupted directory (invalid i_start)");
+ return -EIO;
+ }
+ return 0;
+}
+
/* doesn't deal with root inode */
int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
{
@@ -475,6 +524,10 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
MSDOS_I(inode)->mmu_private = inode->i_size;
set_nlink(inode, fat_subdirs(inode));
+
+ error = fat_validate_dir(inode);
+ if (error < 0)
+ return error;
} else { /* not a directory */
inode->i_generation |= 1;
inode->i_mode = fat_make_mode(sbi, de->attr,
@@ -553,13 +606,43 @@ out:
EXPORT_SYMBOL_GPL(fat_build_inode);
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+ /* Release unwritten fallocated blocks on inode eviction. */
+ if ((inode->i_blocks << 9) >
+ round_up(MSDOS_I(inode)->mmu_private,
+ MSDOS_SB(inode->i_sb)->cluster_size)) {
+ int err;
+
+ fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+ /* Fallocate results in updating the i_start/i_logstart
+ * for a zero-byte file. So, return it to the
+ * original state during evict and commit it to avoid
+ * any corruption on the next access to the cluster
+ * chain for the file.
+ */
+ err = __fat_write_inode(inode, inode_needs_sync(inode));
+ if (err) {
+ fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+ "update on disk inode for unused "
+ "fallocated blocks, inode could be "
+ "corrupted. Please run fsck");
+ }
+
+ }
+}
+
static void fat_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
- }
+ } else
+ fat_free_eofblocks(inode);
+
invalidate_inode_buffers(inode);
clear_inode(inode);
fat_cache_inval_inode(inode);
@@ -677,7 +760,7 @@ static int __init fat_init_inodecache(void)
fat_inode_cachep = kmem_cache_create("fat_inode_cache",
sizeof(struct msdos_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (fat_inode_cachep == NULL)
return -ENOMEM;
@@ -1044,7 +1127,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
}
opts->name_check = 'n';
opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0;
- opts->utf8 = opts->unicode_xlate = 0;
+ opts->unicode_xlate = 0;
opts->numtail = 1;
opts->usefree = opts->nocase = 0;
opts->tz_set = 0;
@@ -1052,6 +1135,8 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
opts->errors = FAT_ERRORS_RO;
*debug = 0;
+ opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat;
+
if (!options)
goto out;
@@ -1146,7 +1231,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_time_offset:
if (match_int(&args[0], &option))
return -EINVAL;
- if (option < -12 * 60 || option > 12 * 60)
+ /*
+ * GMT+-12 zones may have DST corrections so at least
+ * 13 hours difference is needed. Make the limit 24
+ * just in case someone invents something unusual.
+ */
+ if (option < -24 * 60 || option > 24 * 60)
return -EINVAL;
opts->tz_set = 1;
opts->time_offset = option;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ee85cd4e136a..350a2c8cfd28 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -51,7 +51,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (arg & O_NDELAY)
arg |= O_NONBLOCK;
- if (arg & O_DIRECT) {
+ /* Pipe packetized mode is controlled by O_DIRECT flag */
+ if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) {
if (!filp->f_mapping || !filp->f_mapping->a_ops ||
!filp->f_mapping->a_ops->direct_IO)
return -EINVAL;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index d59712dfa3e7..ca3c3dd01789 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd,
path_put(&path);
return fd;
}
- file = file_open_root(path.dentry, path.mnt, "", open_flag);
+ file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
if (IS_ERR(file)) {
put_unused_fd(fd);
retval = PTR_ERR(file);
diff --git a/fs/file.c b/fs/file.c
index 39f8f15921da..1fbc5c0555a9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,9 +25,9 @@
int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
-/* our max() is unusable in constant expressions ;-/ */
-#define __const_max(x, y) ((x) < (y) ? (x) : (y))
-int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+/* our min() is unusable in constant expressions ;-/ */
+#define __const_min(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) &
-BITS_PER_LONG;
static void *alloc_fdmem(size_t size)
@@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size)
* vmalloc() if the allocation size will be considered "large" by the VM.
*/
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+ void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
+ __GFP_NOWARN | __GFP_NORETRY);
if (data != NULL)
return data;
}
- return vmalloc(size);
+ return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
}
static void __free_fdtable(struct fdtable *fdt)
@@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
if (unlikely(nr > sysctl_nr_open))
nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
fdt->max_fds = nr;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 5797d45a78cb..c5618db110be 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs)
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
struct file_system_type **p;
- for (p=&file_systems; *p; p=&(*p)->next)
- if (strlen((*p)->name) == len &&
- strncmp((*p)->name, name, len) == 0)
+ for (p = &file_systems; *p; p = &(*p)->next)
+ if (strncmp((*p)->name, name, len) == 0 &&
+ !(*p)->name[len])
break;
return p;
}
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index cb84f0fcc72a..bfc780c682fb 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -66,11 +66,11 @@ static int
vxfs_immed_readpage(struct file *fp, struct page *pp)
{
struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host);
- u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT;
+ u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT;
caddr_t kaddr;
kaddr = kmap(pp);
- memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE);
+ memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE);
kunmap(pp);
flush_dcache_page(pp);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ef73ed674a27..3e2ccade61ed 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -326,6 +326,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
} else if (S_ISLNK(ip->i_mode)) {
if (!VXFS_ISIMMED(vip)) {
ip->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &vxfs_aops;
} else {
ip->i_op = &simple_symlink_inode_operations;
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 1cff72df0389..a49e0cfbb686 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -45,7 +45,7 @@
/*
* Number of VxFS blocks per page.
*/
-#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize))
+#define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_SIZE / (sbp)->s_blocksize))
static struct dentry * vxfs_lookup(struct inode *, struct dentry *, unsigned int);
@@ -175,7 +175,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
if (de) {
ino = de->d_ino;
kunmap(pp);
- page_cache_release(pp);
+ put_page(pp);
}
return (ino);
@@ -255,8 +255,8 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
nblocks = dir_blocks(ip);
pblocks = VXFS_BLOCK_PER_PAGE(sbp);
- page = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ~PAGE_CACHE_MASK;
+ page = pos >> PAGE_SHIFT;
+ offset = pos & ~PAGE_MASK;
block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
for (; page < npages; page++, block = 0) {
@@ -289,7 +289,7 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
continue;
offset = (char *)de - kaddr;
- ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+ ctx->pos = ((page << PAGE_SHIFT) | offset) + 2;
if (!dir_emit(ctx, de->d_name, de->d_namelen,
de->d_ino, DT_UNKNOWN)) {
vxfs_put_page(pp);
@@ -301,6 +301,6 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
vxfs_put_page(pp);
offset = 0;
}
- ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+ ctx->pos = ((page << PAGE_SHIFT) | offset) + 2;
return 0;
}
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 5d318c44f855..e806694d4145 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -50,7 +50,7 @@ inline void
vxfs_put_page(struct page *pp)
{
kunmap(pp);
- page_cache_release(pp);
+ put_page(pp);
}
/**
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 206a68b1db1a..592cea54cea0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -33,7 +33,7 @@
/*
* 4MB minimal write chunk size
*/
-#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
+#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10))
struct wb_completion {
atomic_t cnt;
@@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi,
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -278,13 +281,15 @@ locked_inode_to_wb_and_lock_list(struct inode *inode)
wb_get(wb);
spin_unlock(&inode->i_lock);
spin_lock(&wb->list_lock);
- wb_put(wb); /* not gonna deref it anymore */
/* i_wb may have changed inbetween, can't use inode_to_wb() */
- if (likely(wb == inode->i_wb))
- return wb; /* @inode already has ref */
+ if (likely(wb == inode->i_wb)) {
+ wb_put(wb); /* @inode already has ref */
+ return wb;
+ }
spin_unlock(&wb->list_lock);
+ wb_put(wb);
cpu_relax();
spin_lock(&inode->i_lock);
}
@@ -424,6 +429,8 @@ skip_switch:
iput(inode);
kfree(isw);
+
+ atomic_dec(&isw_nr_in_flight);
}
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -433,7 +440,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- schedule_work(&isw->work);
+ queue_work(isw_wq, &isw->work);
}
/**
@@ -469,7 +476,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
- if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING) ||
inode_to_wb(inode) == isw->new_wb) {
spin_unlock(&inode->i_lock);
goto out_free;
@@ -480,6 +488,8 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
ihold(inode);
isw->inode = inode;
+ atomic_inc(&isw_nr_in_flight);
+
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
@@ -677,9 +687,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
if (!wbc->wb)
return;
- rcu_read_lock();
id = mem_cgroup_css_from_page(page)->id;
- rcu_read_unlock();
if (id == wbc->wb_id) {
wbc->wb_bytes += bytes;
@@ -842,6 +850,33 @@ restart:
wb_put(last_wb);
}
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+ if (atomic_read(&isw_nr_in_flight)) {
+ synchronize_rcu();
+ flush_workqueue(isw_wq);
+ }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+ isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ if (!isw_wq)
+ return -ENOMEM;
+ return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
@@ -1304,10 +1339,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* we go e.g. from filesystem. Flusher thread uses __writeback_single_inode()
* and does more profound writeback list handling in writeback_sb_inodes().
*/
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
- struct writeback_control *wbc)
+static int writeback_single_inode(struct inode *inode,
+ struct writeback_control *wbc)
{
+ struct bdi_writeback *wb;
int ret = 0;
spin_lock(&inode->i_lock);
@@ -1345,7 +1380,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
ret = __writeback_single_inode(inode, wbc);
wbc_detach_inode(wbc);
- spin_lock(&wb->list_lock);
+
+ wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
/*
* If inode is clean, remove it from writeback lists. Otherwise don't
@@ -1420,6 +1456,7 @@ static long writeback_sb_inodes(struct super_block *sb,
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
+ struct bdi_writeback *tmp_wb;
if (inode->i_sb != sb) {
if (work->sb) {
@@ -1510,15 +1547,23 @@ static long writeback_sb_inodes(struct super_block *sb,
cond_resched();
}
-
- spin_lock(&wb->list_lock);
+ /*
+ * Requeue @inode if still dirty. Be careful as @inode may
+ * have been switched to another wb in the meantime.
+ */
+ tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
- requeue_inode(inode, wb, &wbc);
+ requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
+ if (unlikely(tmp_wb != wb)) {
+ spin_unlock(&tmp_wb->list_lock);
+ spin_lock(&wb->list_lock);
+ }
+
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
@@ -1981,9 +2026,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
void __mark_inode_dirty(struct inode *inode, int flags)
{
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
struct super_block *sb = inode->i_sb;
int dirtytime;
@@ -2093,6 +2138,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
out_unlock_inode:
spin_unlock(&inode->i_lock);
+#undef I_DIRTY_INODE
}
EXPORT_SYMBOL(__mark_inode_dirty);
@@ -2304,7 +2350,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
*/
int write_inode_now(struct inode *inode, int sync)
{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -2316,7 +2361,7 @@ int write_inode_now(struct inode *inode, int sync)
wbc.nr_to_write = 0;
might_sleep();
- return writeback_single_inode(inode, wb, &wbc);
+ return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);
@@ -2333,7 +2378,7 @@ EXPORT_SYMBOL(write_inode_now);
*/
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
- return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
+ return writeback_single_inode(inode, wbc);
}
EXPORT_SYMBOL(sync_inode);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d403c69bee08..4304072161aa 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
switch (cookie->def->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 6d941f56faf4..9b28649df3a1 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -22,6 +22,7 @@ static LIST_HEAD(fscache_netfs_list);
int __fscache_register_netfs(struct fscache_netfs *netfs)
{
struct fscache_netfs *ptr;
+ struct fscache_cookie *cookie;
int ret;
_enter("{%s}", netfs->name);
@@ -29,29 +30,25 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
INIT_LIST_HEAD(&netfs->link);
/* allocate a cookie for the primary index */
- netfs->primary_index =
- kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+ cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
- if (!netfs->primary_index) {
+ if (!cookie) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
/* initialise the primary index cookie */
- atomic_set(&netfs->primary_index->usage, 1);
- atomic_set(&netfs->primary_index->n_children, 0);
- atomic_set(&netfs->primary_index->n_active, 1);
+ atomic_set(&cookie->usage, 1);
+ atomic_set(&cookie->n_children, 0);
+ atomic_set(&cookie->n_active, 1);
- netfs->primary_index->def = &fscache_fsdef_netfs_def;
- netfs->primary_index->parent = &fscache_fsdef_index;
- netfs->primary_index->netfs_data = netfs;
- netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED;
+ cookie->def = &fscache_fsdef_netfs_def;
+ cookie->parent = &fscache_fsdef_index;
+ cookie->netfs_data = netfs;
+ cookie->flags = 1 << FSCACHE_COOKIE_ENABLED;
- atomic_inc(&netfs->primary_index->parent->usage);
- atomic_inc(&netfs->primary_index->parent->n_children);
-
- spin_lock_init(&netfs->primary_index->lock);
- INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+ spin_lock_init(&cookie->lock);
+ INIT_HLIST_HEAD(&cookie->backing_objects);
/* check the netfs type is not already present */
down_write(&fscache_addremove_sem);
@@ -62,6 +59,10 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
goto already_registered;
}
+ atomic_inc(&cookie->parent->usage);
+ atomic_inc(&cookie->parent->n_children);
+
+ netfs->primary_index = cookie;
list_add(&netfs->link, &fscache_netfs_list);
ret = 0;
@@ -70,11 +71,8 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
already_registered:
up_write(&fscache_addremove_sem);
- if (ret < 0) {
- netfs->primary_index->parent = NULL;
- __fscache_cookie_put(netfs->primary_index);
- netfs->primary_index = NULL;
- }
+ if (ret < 0)
+ kmem_cache_free(fscache_cookie_jar, cookie);
_leave(" = %d", ret);
return ret;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 483bbc613bf0..3078b679fcd1 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
/*
* decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_WAIT is flagged
+ * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
*/
bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
struct page *page,
@@ -113,7 +113,7 @@ try_again:
wake_up_bit(&cookie->flags, 0);
if (xpage)
- page_cache_release(xpage);
+ put_page(xpage);
__fscache_uncache_page(cookie, page);
return true;
@@ -122,7 +122,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -132,7 +132,7 @@ page_busy:
_debug("fscache writeout timeout page: %p{%lx}",
page, page->index);
- gfp &= ~__GFP_WAIT;
+ gfp &= ~__GFP_DIRECT_RECLAIM;
goto try_again;
}
EXPORT_SYMBOL(__fscache_maybe_release_page);
@@ -164,7 +164,7 @@ static void fscache_end_page_write(struct fscache_object *object,
}
spin_unlock(&object->lock);
if (xpage)
- page_cache_release(xpage);
+ put_page(xpage);
}
/*
@@ -816,7 +816,7 @@ static void fscache_write_op(struct fscache_operation *_op)
goto superseded;
page = results[0];
_debug("gang %d [%lx]", n, page->index);
- if (page->index > op->store_limit) {
+ if (page->index >= op->store_limit) {
fscache_stat(&fscache_n_store_pages_over_limit);
goto superseded;
}
@@ -884,7 +884,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
spin_unlock(&cookie->stores_lock);
for (i = n - 1; i >= 0; i--)
- page_cache_release(results[i]);
+ put_page(results[i]);
}
_leave("");
@@ -982,7 +982,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
radix_tree_tag_set(&cookie->stores, page->index,
FSCACHE_COOKIE_PENDING_TAG);
- page_cache_get(page);
+ get_page(page);
/* we only want one writer at a time, but we do need to queue new
* writers after exclusive ops */
@@ -1026,7 +1026,7 @@ submit_failed:
radix_tree_delete(&cookie->stores, page->index);
spin_unlock(&cookie->stores_lock);
wake_cookie = __fscache_unuse_cookie(cookie);
- page_cache_release(page);
+ put_page(page);
ret = -ENOBUFS;
goto nobufs;
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index eae2c11268bc..c5b6b7165489 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt)
static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
{
- struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
loff_t pos = 0;
return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
@@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
{
- struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
loff_t pos = 0;
/*
* No locking or generic_write_checks(), the server is
@@ -549,6 +549,8 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
unregister_chrdev_region(cc->cdev->dev, 1);
cdev_del(cc->cdev);
}
+ /* Base reference is now owned by "fud" */
+ fuse_conn_put(&cc->fc);
rc = fuse_dev_release(inode, file); /* puts the base reference */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ebb5e37455a0..cbece1221417 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -897,7 +897,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
return err;
}
- page_cache_get(newpage);
+ get_page(newpage);
if (!(buf->flags & PIPE_BUF_FLAG_LRU))
lru_cache_add_file(newpage);
@@ -912,12 +912,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (err) {
unlock_page(newpage);
- page_cache_release(newpage);
+ put_page(newpage);
return err;
}
unlock_page(oldpage);
- page_cache_release(oldpage);
+ put_page(oldpage);
cs->len = 0;
return 0;
@@ -951,7 +951,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
fuse_copy_finish(cs);
buf = cs->pipebufs;
- page_cache_get(page);
+ get_page(page);
buf->page = page;
buf->offset = offset;
buf->len = count;
@@ -1435,7 +1435,7 @@ out_unlock:
out:
for (; page_nr < cs.nr_segs; page_nr++)
- page_cache_release(bufs[page_nr].page);
+ put_page(bufs[page_nr].page);
kfree(bufs);
return ret;
@@ -1632,8 +1632,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
goto out_up_killsb;
mapping = inode->i_mapping;
- index = outarg.offset >> PAGE_CACHE_SHIFT;
- offset = outarg.offset & ~PAGE_CACHE_MASK;
+ index = outarg.offset >> PAGE_SHIFT;
+ offset = outarg.offset & ~PAGE_MASK;
file_size = i_size_read(inode);
end = outarg.offset + outarg.size;
if (end > file_size) {
@@ -1652,13 +1652,13 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
if (!page)
goto out_iput;
- this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+ this_num = min_t(unsigned, num, PAGE_SIZE - offset);
err = fuse_copy_page(cs, &page, offset, this_num, 0);
if (!err && offset == 0 &&
- (this_num == PAGE_CACHE_SIZE || file_size == end))
+ (this_num == PAGE_SIZE || file_size == end))
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (err)
goto out_iput;
@@ -1697,7 +1697,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
size_t total_len = 0;
int num_pages;
- offset = outarg->offset & ~PAGE_CACHE_MASK;
+ offset = outarg->offset & ~PAGE_MASK;
file_size = i_size_read(inode);
num = outarg->size;
@@ -1720,7 +1720,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
req->page_descs[0].offset = offset;
req->end = fuse_retrieve_end;
- index = outarg->offset >> PAGE_CACHE_SHIFT;
+ index = outarg->offset >> PAGE_SHIFT;
while (num && req->num_pages < num_pages) {
struct page *page;
@@ -1730,7 +1730,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
if (!page)
break;
- this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+ this_num = min_t(unsigned, num, PAGE_SIZE - offset);
req->pages[req->num_pages] = page;
req->page_descs[req->num_pages].length = this_num;
req->num_pages++;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5e2e08712d3b..4b855b65d457 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
if (!parent)
return -ENOENT;
- mutex_lock(&parent->i_mutex);
+ inode_lock(parent);
if (!S_ISDIR(parent->i_mode))
goto unlock;
@@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
fuse_invalidate_entry(entry);
if (child_nodeid != 0 && d_really_is_positive(entry)) {
- mutex_lock(&d_inode(entry)->i_mutex);
+ inode_lock(d_inode(entry));
if (get_node_id(d_inode(entry)) != child_nodeid) {
err = -ENOENT;
goto badentry;
@@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
clear_nlink(d_inode(entry));
err = 0;
badentry:
- mutex_unlock(&d_inode(entry)->i_mutex);
+ inode_unlock(d_inode(entry));
if (!err)
d_delete(entry);
} else {
@@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
dput(entry);
unlock:
- mutex_unlock(&parent->i_mutex);
+ inode_unlock(parent);
iput(parent);
return err;
}
@@ -1365,15 +1365,19 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
return err;
}
-static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
+static const char *fuse_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct fuse_conn *fc = get_fuse_conn(inode);
FUSE_ARGS(args);
char *link;
ssize_t ret;
- link = (char *) __get_free_page(GFP_KERNEL);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ link = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!link)
return ERR_PTR(-ENOMEM);
@@ -1385,11 +1389,11 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
args.out.args[0].value = link;
ret = fuse_simple_request(fc, &args);
if (ret < 0) {
- free_page((unsigned long) link);
+ kfree(link);
link = ERR_PTR(ret);
} else {
link[ret] = '\0';
- *cookie = link;
+ set_delayed_call(done, kfree_link, link);
}
fuse_invalidate_atime(inode);
return link;
@@ -1500,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode)
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
- BUG_ON(!mutex_is_locked(&inode->i_mutex));
+ BUG_ON(!inode_is_locked(inode));
spin_lock(&fc->lock);
BUG_ON(fi->writectr < 0);
@@ -1909,8 +1913,7 @@ static const struct inode_operations fuse_common_inode_operations = {
static const struct inode_operations fuse_symlink_inode_operations = {
.setattr = fuse_setattr,
- .follow_link = fuse_follow_link,
- .put_link = free_page_put_link,
+ .get_link = fuse_get_link,
.readlink = generic_readlink,
.getattr = fuse_getattr,
.setxattr = fuse_setxattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e0faf8f2c868..719924d6c706 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
return err;
if (lock_inode)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
@@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
fuse_finish_open(inode, file);
if (lock_inode)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -348,7 +348,7 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
pgoff_t curr_index;
BUG_ON(req->inode != inode);
- curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
if (idx_from < curr_index + req->num_pages &&
curr_index <= idx_to) {
found = true;
@@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
fuse_sync_writes(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
req = fuse_get_req_nofail_nopages(fc, file);
memset(&inarg, 0, sizeof(inarg));
@@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
if (is_bad_inode(inode))
return -EIO;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Start writeback against all dirty pages of the inode, then
@@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
err = 0;
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -528,6 +528,11 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
+static void fuse_io_release(struct kref *kref)
+{
+ kfree(container_of(kref, struct fuse_io_priv, refcnt));
+}
+
static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
if (io->err)
@@ -585,8 +590,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
}
io->iocb->ki_complete(io->iocb, res, 0);
- kfree(io);
}
+
+ kref_put(&io->refcnt, fuse_io_release);
}
static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
@@ -613,6 +619,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
size_t num_bytes, struct fuse_io_priv *io)
{
spin_lock(&io->lock);
+ kref_get(&io->refcnt);
io->size += num_bytes;
io->reqs++;
spin_unlock(&io->lock);
@@ -676,11 +683,11 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
* present there.
*/
int i;
- int start_idx = num_read >> PAGE_CACHE_SHIFT;
- size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+ int start_idx = num_read >> PAGE_SHIFT;
+ size_t off = num_read & (PAGE_SIZE - 1);
for (i = start_idx; i < req->num_pages; i++) {
- zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
+ zero_user_segment(req->pages[i], off, PAGE_SIZE);
off = 0;
}
} else {
@@ -691,13 +698,13 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
static int fuse_do_readpage(struct file *file, struct page *page)
{
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
struct inode *inode = page->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
size_t num_read;
loff_t pos = page_offset(page);
- size_t count = PAGE_CACHE_SIZE;
+ size_t count = PAGE_SIZE;
u64 attr_ver;
int err;
@@ -782,7 +789,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
else
SetPageError(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (req->ff)
fuse_file_put(req->ff, false);
@@ -793,7 +800,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
loff_t pos = page_offset(req->pages[0]);
- size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+ size_t count = req->num_pages << PAGE_SHIFT;
req->out.argpages = 1;
req->out.page_zeroing = 1;
@@ -829,7 +836,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
if (req->num_pages &&
(req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
- (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+ (req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
int nr_alloc = min_t(unsigned, data->nr_pages,
FUSE_MAX_PAGES_PER_REQ);
@@ -851,7 +858,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
return -EIO;
}
- page_cache_get(page);
+ get_page(page);
req->pages[req->num_pages] = page;
req->page_descs[req->num_pages].length = PAGE_SIZE;
req->num_pages++;
@@ -984,7 +991,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
size_t res;
unsigned offset;
unsigned i;
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
for (i = 0; i < req->num_pages; i++)
fuse_wait_on_page_writeback(inode, req->pages[i]->index);
@@ -996,17 +1003,17 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
for (i = 0; i < req->num_pages; i++) {
struct page *page = req->pages[i];
- if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
+ if (!req->out.h.error && !offset && count >= PAGE_SIZE)
SetPageUptodate(page);
- if (count > PAGE_CACHE_SIZE - offset)
- count -= PAGE_CACHE_SIZE - offset;
+ if (count > PAGE_SIZE - offset)
+ count -= PAGE_SIZE - offset;
else
count = 0;
offset = 0;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return res;
@@ -1017,7 +1024,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
struct iov_iter *ii, loff_t pos)
{
struct fuse_conn *fc = get_fuse_conn(mapping->host);
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = pos & (PAGE_SIZE - 1);
size_t count = 0;
int err;
@@ -1027,8 +1034,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
do {
size_t tmp;
struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
+ pgoff_t index = pos >> PAGE_SHIFT;
+ size_t bytes = min_t(size_t, PAGE_SIZE - offset,
iov_iter_count(ii));
bytes = min_t(size_t, bytes, fc->max_write - count);
@@ -1049,9 +1056,10 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
flush_dcache_page(page);
+ iov_iter_advance(ii, tmp);
if (!tmp) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
bytes = min(bytes, iov_iter_single_seg_count(ii));
goto again;
}
@@ -1061,11 +1069,10 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
req->page_descs[req->num_pages].length = tmp;
req->num_pages++;
- iov_iter_advance(ii, tmp);
count += tmp;
pos += tmp;
offset += tmp;
- if (offset == PAGE_CACHE_SIZE)
+ if (offset == PAGE_SIZE)
offset = 0;
if (!fc->big_writes)
@@ -1079,8 +1086,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
{
return min_t(unsigned,
- ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
- (pos >> PAGE_CACHE_SHIFT) + 1,
+ ((pos + len - 1) >> PAGE_SHIFT) -
+ (pos >> PAGE_SHIFT) + 1,
FUSE_MAX_PAGES_PER_REQ);
}
@@ -1160,7 +1167,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
return generic_file_write_iter(iocb, from);
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
@@ -1198,8 +1205,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
invalidate_mapping_pages(file->f_mapping,
- pos >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
@@ -1210,7 +1217,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
out:
current->backing_dev_info = NULL;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return written ? written : err;
}
@@ -1240,6 +1247,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
size_t *nbytesp, int write)
{
size_t nbytes = 0; /* # bytes already packed in req */
+ ssize_t ret = 0;
/* Special case for kernel I/O: can copy directly into the buffer */
if (ii->type & ITER_KVEC) {
@@ -1259,13 +1267,12 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
unsigned npages;
size_t start;
- ssize_t ret = iov_iter_get_pages(ii,
- &req->pages[req->num_pages],
+ ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
*nbytesp - nbytes,
req->max_pages - req->num_pages,
&start);
if (ret < 0)
- return ret;
+ break;
iov_iter_advance(ii, ret);
nbytes += ret;
@@ -1288,7 +1295,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
*nbytesp = nbytes;
- return 0;
+ return ret;
}
static inline int fuse_iter_npages(const struct iov_iter *ii_p)
@@ -1308,10 +1315,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
size_t count = iov_iter_count(iter);
- pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
- pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t idx_from = pos >> PAGE_SHIFT;
+ pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
ssize_t res = 0;
struct fuse_req *req;
+ int err = 0;
if (io->async)
req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
@@ -1322,21 +1330,19 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
fuse_sync_writes(inode);
if (!write)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
while (count) {
size_t nres;
fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
- int err = fuse_get_user_pages(req, iter, &nbytes, write);
- if (err) {
- res = err;
+ err = fuse_get_user_pages(req, iter, &nbytes, write);
+ if (err && !nbytes)
break;
- }
if (write)
nres = fuse_send_write(req, io, pos, nbytes, owner);
@@ -1346,11 +1352,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (!io->async)
fuse_release_user_pages(req, !write);
if (req->out.h.error) {
- if (!res)
- res = req->out.h.error;
+ err = req->out.h.error;
break;
} else if (nres > nbytes) {
- res = -EIO;
+ res = 0;
+ err = -EIO;
break;
}
count -= nres;
@@ -1374,7 +1380,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
if (res > 0)
*ppos = pos;
- return res;
+ return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
@@ -1398,7 +1404,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp);
return __fuse_direct_read(&io, to, &iocb->ki_pos);
}
@@ -1406,21 +1412,21 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
- struct fuse_io_priv io = { .async = 0, .file = file };
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
ssize_t res;
if (is_bad_inode(inode))
return -EIO;
/* Don't allow parallel writes to the same file */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
res = generic_write_checks(iocb, from);
if (res > 0)
res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
fuse_invalidate_attr(inode);
if (res > 0)
fuse_write_update_size(inode, iocb->ki_pos);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -1460,7 +1466,7 @@ __acquires(fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
- __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
+ __u64 data_size = req->num_pages * PAGE_SIZE;
if (!fc->connected)
goto out_free;
@@ -1721,7 +1727,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
list_del(&new_req->writepages_entry);
list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
BUG_ON(old_req->inode != new_req->inode);
- curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT;
if (curr_index <= page->index &&
page->index < curr_index + old_req->num_pages) {
found = true;
@@ -1736,7 +1742,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
new_req->num_pages = 1;
for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
BUG_ON(tmp->inode != new_req->inode);
- curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
if (tmp->num_pages == 1 &&
curr_index == page->index) {
old_req = tmp;
@@ -1793,7 +1799,7 @@ static int fuse_writepages_fill(struct page *page,
if (req && req->num_pages &&
(is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
- (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+ (req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
fuse_writepages_send(data);
data->req = NULL;
@@ -1918,7 +1924,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
struct page *page;
loff_t fsize;
@@ -1932,15 +1938,15 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
fuse_wait_on_page_writeback(mapping->host, page->index);
- if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+ if (PageUptodate(page) || len == PAGE_SIZE)
goto success;
/*
* Check if the start this page comes after the end of file, in which
* case the readpage can be optimized away.
*/
fsize = i_size_read(mapping->host);
- if (fsize <= (pos & PAGE_CACHE_MASK)) {
- size_t off = pos & ~PAGE_CACHE_MASK;
+ if (fsize <= (pos & PAGE_MASK)) {
+ size_t off = pos & ~PAGE_MASK;
if (off)
zero_user_segment(page, 0, off);
goto success;
@@ -1954,7 +1960,7 @@ success:
cleanup:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
error:
return err;
}
@@ -1967,16 +1973,16 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
if (!PageUptodate(page)) {
/* Zero any unwritten bytes at the end of the page */
- size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+ size_t endoff = (pos + copied) & ~PAGE_MASK;
if (endoff)
- zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+ zero_user_segment(page, endoff, PAGE_SIZE);
SetPageUptodate(page);
}
fuse_write_update_size(inode, pos + copied);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -2231,20 +2237,77 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
return err ? 0 : outarg.block;
}
+static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ FUSE_ARGS(args);
+ struct fuse_lseek_in inarg = {
+ .fh = ff->fh,
+ .offset = offset,
+ .whence = whence
+ };
+ struct fuse_lseek_out outarg;
+ int err;
+
+ if (fc->no_lseek)
+ goto fallback;
+
+ args.in.h.opcode = FUSE_LSEEK;
+ args.in.h.nodeid = ff->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (err) {
+ if (err == -ENOSYS) {
+ fc->no_lseek = 1;
+ goto fallback;
+ }
+ return err;
+ }
+
+ return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
+
+fallback:
+ err = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!err)
+ return generic_file_llseek(file, offset, whence);
+ else
+ return err;
+}
+
static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
loff_t retval;
struct inode *inode = file_inode(file);
- /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
- if (whence == SEEK_CUR || whence == SEEK_SET)
- return generic_file_llseek(file, offset, whence);
-
- mutex_lock(&inode->i_mutex);
- retval = fuse_update_attributes(inode, NULL, file, NULL);
- if (!retval)
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
retval = generic_file_llseek(file, offset, whence);
- mutex_unlock(&inode->i_mutex);
+ break;
+ case SEEK_END:
+ inode_lock(inode);
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, whence);
+ inode_unlock(inode);
+ break;
+ case SEEK_HOLE:
+ case SEEK_DATA:
+ inode_lock(inode);
+ retval = fuse_lseek(file, offset, whence);
+ inode_unlock(inode);
+ break;
+ default:
+ retval = -EINVAL;
+ }
return retval;
}
@@ -2786,6 +2849,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
loff_t i_size;
size_t count = iov_iter_count(iter);
struct fuse_io_priv *io;
+ bool is_sync = is_sync_kiocb(iocb);
pos = offset;
inode = file->f_mapping->host;
@@ -2806,6 +2870,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
if (!io)
return -ENOMEM;
spin_lock_init(&io->lock);
+ kref_init(&io->refcnt);
io->reqs = 1;
io->bytes = -1;
io->size = 0;
@@ -2825,12 +2890,18 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
* to wait on real async I/O requests, so we must submit this request
* synchronously.
*/
- if (!is_sync_kiocb(iocb) && (offset + count > i_size) &&
+ if (!is_sync && (offset + count > i_size) &&
iov_iter_rw(iter) == WRITE)
io->async = false;
- if (io->async && is_sync_kiocb(iocb))
+ if (io->async && is_sync) {
+ /*
+ * Additional reference to keep io around after
+ * calling fuse_aio_complete()
+ */
+ kref_get(&io->refcnt);
io->done = &wait;
+ }
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
@@ -2843,14 +2914,14 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
- if (!is_sync_kiocb(iocb))
+ if (!is_sync)
return -EIOCBQUEUED;
wait_for_completion(&wait);
ret = fuse_get_res_by_io(io);
}
- kfree(io);
+ kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
if (ret > 0)
@@ -2887,7 +2958,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
return -EOPNOTSUPP;
if (lock_inode) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (mode & FALLOC_FL_PUNCH_HOLE) {
loff_t endbyte = offset + length - 1;
err = filemap_write_and_wait_range(inode->i_mapping,
@@ -2933,7 +3004,7 @@ out:
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
if (lock_inode)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 405113101db8..eddbe02c4028 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -22,6 +22,7 @@
#include <linux/rbtree.h>
#include <linux/poll.h>
#include <linux/workqueue.h>
+#include <linux/kref.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -243,6 +244,7 @@ struct fuse_args {
/** The request IO state (for asynchronous processing) */
struct fuse_io_priv {
+ struct kref refcnt;
int async;
spinlock_t lock;
unsigned reqs;
@@ -256,6 +258,13 @@ struct fuse_io_priv {
struct completion *done;
};
+#define FUSE_IO_PRIV_SYNC(f) \
+{ \
+ .refcnt = { ATOMIC_INIT(1) }, \
+ .async = 0, \
+ .file = f, \
+}
+
/**
* Request flags
*
@@ -605,6 +614,9 @@ struct fuse_conn {
/** Does the filesystem support asynchronous direct-IO submission? */
unsigned async_dio:1;
+ /** Is lseek not implemented by fs? */
+ unsigned no_lseek:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2a5b99..1ce67668a8e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -339,11 +339,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
fuse_invalidate_attr(inode);
if (offset >= 0) {
- pg_start = offset >> PAGE_CACHE_SHIFT;
+ pg_start = offset >> PAGE_SHIFT;
if (len <= 0)
pg_end = -1;
else
- pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+ pg_end = (offset + len - 1) >> PAGE_SHIFT;
invalidate_inode_pages2_range(inode->i_mapping,
pg_start, pg_end);
}
@@ -864,7 +864,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
process_init_limits(fc, arg);
if (arg->minor >= 6) {
- ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
+ ra_pages = arg->max_readahead / PAGE_SIZE;
if (arg->flags & FUSE_ASYNC_READ)
fc->async_read = 1;
if (!(arg->flags & FUSE_POSIX_LOCKS))
@@ -901,7 +901,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
if (arg->time_gran && arg->time_gran <= 1000000000)
fc->sb->s_time_gran = arg->time_gran;
} else {
- ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+ ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
@@ -922,7 +922,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
- arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
+ arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -955,7 +955,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
int err;
fc->bdi.name = "fuse";
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
/* fuse does it's own writeback accounting */
fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
@@ -1053,8 +1053,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
goto err;
#endif
} else {
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
}
sb->s_magic = FUSE_SUPER_MAGIC;
sb->s_op = &fuse_super_operations;
@@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void)
int err;
fuse_inode_cachep = kmem_cache_create("fuse_inode",
- sizeof(struct fuse_inode),
- 0, SLAB_HWCACHE_ALIGN,
+ sizeof(struct fuse_inode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
fuse_inode_init_once);
err = -ENOMEM;
if (!fuse_inode_cachep)
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1be3b061c05c..791932617d1a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -31,9 +31,9 @@ static const char *gfs2_acl_name(int type)
{
switch (type) {
case ACL_TYPE_ACCESS:
- return GFS2_POSIX_ACL_ACCESS;
+ return XATTR_POSIX_ACL_ACCESS;
case ACL_TYPE_DEFAULT:
- return GFS2_POSIX_ACL_DEFAULT;
+ return XATTR_POSIX_ACL_DEFAULT;
}
return NULL;
}
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 2d65ec4cd4be..3af4f407a483 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -12,8 +12,6 @@
#include "incore.h"
-#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
-#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1caee0534587..1bbbee945f46 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -101,7 +101,7 @@ static int gfs2_writepage_common(struct page *page,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
@@ -109,9 +109,9 @@ static int gfs2_writepage_common(struct page *page,
if (current->journal_info)
goto redirty;
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
goto out;
}
return 1;
@@ -238,7 +238,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct gfs2_sbd *sdp = GFS2_SB(inode);
- unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+ unsigned nrblocks = nr_pages * (PAGE_SIZE/inode->i_sb->s_blocksize);
int i;
int ret;
@@ -366,8 +366,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
cycled = 0;
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
@@ -458,7 +458,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
* so we need to supply one here. It doesn't happen often.
*/
if (unlikely(page->index)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
SetPageUptodate(page);
return 0;
}
@@ -471,7 +471,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+ memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap_atomic(kaddr);
flush_dcache_page(page);
brelse(dibh);
@@ -560,8 +560,8 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
unsigned size)
{
struct address_space *mapping = ip->i_inode.i_mapping;
- unsigned long index = *pos / PAGE_CACHE_SIZE;
- unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+ unsigned long index = *pos / PAGE_SIZE;
+ unsigned offset = *pos & (PAGE_SIZE - 1);
unsigned copied = 0;
unsigned amt;
struct page *page;
@@ -569,15 +569,15 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
do {
amt = size - copied;
- if (offset + size > PAGE_CACHE_SIZE)
- amt = PAGE_CACHE_SIZE - offset;
+ if (offset + size > PAGE_SIZE)
+ amt = PAGE_SIZE - offset;
page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
p = kmap_atomic(page);
memcpy(buf + copied, p + offset, amt);
kunmap_atomic(p);
- page_cache_release(page);
+ put_page(page);
copied += amt;
index++;
offset = 0;
@@ -651,8 +651,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
unsigned requested = 0;
int alloc_required;
int error = 0;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned from = pos & (PAGE_SIZE - 1);
struct page *page;
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -697,7 +697,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
rblocks += gfs2_rg_blocks(ip, requested);
error = gfs2_trans_begin(sdp, rblocks,
- PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+ PAGE_SIZE/sdp->sd_sb.sb_bsize);
if (error)
goto out_trans_fail;
@@ -727,7 +727,7 @@ out:
return 0;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
gfs2_trans_end(sdp);
if (pos + len > ip->i_inode.i_size)
@@ -827,7 +827,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
if (!PageUptodate(page))
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (copied) {
if (inode->i_size < to)
@@ -877,7 +877,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
struct buffer_head *dibh;
- unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int from = pos & (PAGE_SIZE - 1);
unsigned int to = from + len;
int ret;
struct gfs2_trans *tr = current->journal_info;
@@ -888,7 +888,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
ret = gfs2_meta_inode_buffer(ip, &dibh);
if (unlikely(ret)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto failed;
}
@@ -914,7 +914,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
failed:
gfs2_trans_end(sdp);
gfs2_inplace_release(ip);
- if (ip->i_res->rs_qa_qd_num)
+ if (ip->i_qadata && ip->i_qadata->qa_qd_num)
gfs2_quota_unlock(ip);
if (inode == sdp->sd_rindex) {
gfs2_glock_dq(&m_ip->i_gh);
@@ -992,7 +992,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset,
{
struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_CACHE_SIZE);
+ int partial_page = (offset || length < PAGE_SIZE);
struct buffer_head *bh, *head;
unsigned long pos = 0;
@@ -1082,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* the first place, mapping->nr_pages will always be zero.
*/
if (mapping->nrpages) {
- loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
+ loff_t lstart = offset & ~(PAGE_SIZE - 1);
loff_t len = iov_iter_count(iter);
loff_t end = PAGE_ALIGN(offset + len) - 1;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 61296ecbd0e2..24ce1cdd434a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,7 +75,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
dsize = dibh->b_size - sizeof(struct gfs2_dinode);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
- memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+ memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap(page);
SetPageUptodate(page);
@@ -98,7 +98,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
if (release) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return 0;
@@ -787,8 +787,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (error)
goto out_rlist;
- if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
- gfs2_rs_deltree(ip->i_res);
+ if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
+ gfs2_rs_deltree(&ip->i_res);
error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
RES_INDIRECT + RES_STATFS + RES_QUOTA,
@@ -932,8 +932,8 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
{
struct inode *inode = mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- unsigned long index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned long index = from >> PAGE_SHIFT;
+ unsigned offset = from & (PAGE_SIZE-1);
unsigned blocksize, iblock, length, pos;
struct buffer_head *bh;
struct page *page;
@@ -945,7 +945,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
- iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
@@ -989,7 +989,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
mark_buffer_dirty(bh);
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -1291,13 +1291,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
if (ret)
return ret;
- ret = get_write_access(inode);
- if (ret)
- return ret;
-
inode_dio_wait(inode);
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out;
@@ -1307,10 +1303,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
goto out;
}
- gfs2_rs_deltree(ip->i_res);
ret = do_shrink(inode, oldsize, newsize);
out:
- put_write_access(inode);
+ gfs2_rsqa_delete(ip, NULL);
return ret;
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 487527b42d94..4a01f30e9995 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+#define GFS2_HASH_INDEX_MASK 0xffffc000
+#define GFS2_USE_HASH_FLAG 0x2000
struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;
@@ -108,7 +110,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
struct buffer_head *bh;
int error;
- error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh);
if (error)
return error;
if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
@@ -305,7 +307,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
BUG_ON(extlen < 1);
bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
} else {
- error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh);
if (error)
goto fail;
}
@@ -388,8 +390,13 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
*/
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
- __be64 *hc = ip->i_hash_cache;
+ __be64 *hc;
+
+ spin_lock(&ip->i_inode.i_lock);
+ hc = ip->i_hash_cache;
ip->i_hash_cache = NULL;
+ spin_unlock(&ip->i_inode.i_lock);
+
kvfree(hc);
}
@@ -438,6 +445,27 @@ static int gfs2_dirent_last(const struct gfs2_dirent *dent,
return 0;
}
+/* Look for the dirent that contains the offset specified in ptr. Once we
+ * find that dirent, there must be space available there for the new dirent */
+static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *ptr)
+{
+ unsigned required = GFS2_DIRENT_SIZE(name->len);
+ unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+ unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+ if (ptr < (void *)dent || ptr >= (void *)dent + totlen)
+ return 0;
+ if (gfs2_dirent_sentinel(dent))
+ actual = 0;
+ if (ptr < (void *)dent + actual)
+ return -1;
+ if ((void *)dent + totlen >= ptr + required)
+ return 1;
+ return -1;
+}
+
static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
const struct qstr *name,
void *opaque)
@@ -677,6 +705,27 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
prev->de_rec_len = cpu_to_be16(prev_rec_len);
}
+
+static struct gfs2_dirent *do_init_dirent(struct inode *inode,
+ struct gfs2_dirent *dent,
+ const struct qstr *name,
+ struct buffer_head *bh,
+ unsigned offset)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_dirent *ndent;
+ unsigned totlen;
+
+ totlen = be16_to_cpu(dent->de_rec_len);
+ BUG_ON(offset + name->len > totlen);
+ gfs2_trans_add_meta(ip->i_gl, bh);
+ ndent = (struct gfs2_dirent *)((char *)dent + offset);
+ dent->de_rec_len = cpu_to_be16(offset);
+ gfs2_qstr2dirent(name, totlen - offset, ndent);
+ return ndent;
+}
+
+
/*
* Takes a dent from which to grab space as an argument. Returns the
* newly created dent.
@@ -686,31 +735,25 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
const struct qstr *name,
struct buffer_head *bh)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_dirent *ndent;
- unsigned offset = 0, totlen;
+ unsigned offset = 0;
if (!gfs2_dirent_sentinel(dent))
offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
- totlen = be16_to_cpu(dent->de_rec_len);
- BUG_ON(offset + name->len > totlen);
- gfs2_trans_add_meta(ip->i_gl, bh);
- ndent = (struct gfs2_dirent *)((char *)dent + offset);
- dent->de_rec_len = cpu_to_be16(offset);
- gfs2_qstr2dirent(name, totlen - offset, ndent);
- return ndent;
+ return do_init_dirent(inode, dent, name, bh, offset);
}
-static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
- struct buffer_head *bh,
- const struct qstr *name)
+static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode,
+ struct buffer_head *bh,
+ const struct qstr *name,
+ void *ptr)
{
struct gfs2_dirent *dent;
dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
- gfs2_dirent_find_space, name, NULL);
+ gfs2_dirent_find_offset, name, ptr);
if (!dent || IS_ERR(dent))
return dent;
- return gfs2_init_dirent(inode, dent, name, bh);
+ return do_init_dirent(inode, dent, name, bh,
+ (unsigned)(ptr - (void *)dent));
}
static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
@@ -718,7 +761,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
{
int error;
- error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+ error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp);
if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
/* pr_info("block num=%llu\n", leaf_no); */
error = -EIO;
@@ -755,7 +798,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
int error;
error = get_leaf_nr(dip, index, &leaf_no);
- if (!error)
+ if (!IS_ERR_VALUE(error))
error = get_leaf(dip, leaf_no, bh_out);
return error;
@@ -971,7 +1014,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
index = name->hash >> (32 - dip->i_depth);
error = get_leaf_nr(dip, index, &leaf_no);
- if (error)
+ if (IS_ERR_VALUE(error))
return error;
/* Get the old leaf block */
@@ -1046,10 +1089,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
if (!gfs2_dirent_sentinel(dent) &&
be32_to_cpu(dent->de_hash) < divider) {
struct qstr str;
+ void *ptr = ((char *)dent - obh->b_data) + nbh->b_data;
str.name = (char*)(dent+1);
str.len = be16_to_cpu(dent->de_name_len);
str.hash = be32_to_cpu(dent->de_hash);
- new = gfs2_dirent_alloc(inode, nbh, &str);
+ new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr);
if (IS_ERR(new)) {
error = PTR_ERR(new);
break;
@@ -1181,10 +1225,10 @@ static int compare_dents(const void *a, const void *b)
int ret = 0;
dent_a = *(const struct gfs2_dirent **)a;
- hash_a = be32_to_cpu(dent_a->de_hash);
+ hash_a = dent_a->de_cookie;
dent_b = *(const struct gfs2_dirent **)b;
- hash_b = be32_to_cpu(dent_b->de_hash);
+ hash_b = dent_b->de_cookie;
if (hash_a > hash_b)
ret = 1;
@@ -1222,19 +1266,20 @@ static int compare_dents(const void *a, const void *b)
*/
static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
- const struct gfs2_dirent **darr, u32 entries,
- int *copied)
+ struct gfs2_dirent **darr, u32 entries,
+ u32 sort_start, int *copied)
{
const struct gfs2_dirent *dent, *dent_next;
u64 off, off_next;
unsigned int x, y;
int run = 0;
- sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+ if (sort_start < entries)
+ sort(&darr[sort_start], entries - sort_start,
+ sizeof(struct gfs2_dirent *), compare_dents, NULL);
dent_next = darr[0];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
+ off_next = dent_next->de_cookie;
for (x = 0, y = 1; x < entries; x++, y++) {
dent = dent_next;
@@ -1242,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
if (y < entries) {
dent_next = darr[y];
- off_next = be32_to_cpu(dent_next->de_hash);
- off_next = gfs2_disk_hash2offset(off_next);
+ off_next = dent_next->de_cookie;
if (off < ctx->pos)
continue;
@@ -1290,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
return ptr;
}
+
+static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ unsigned leaf_nr, struct gfs2_dirent **darr,
+ unsigned entries)
+{
+ int sort_id = -1;
+ int i;
+
+ for (i = 0; i < entries; i++) {
+ unsigned offset;
+
+ darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash);
+ darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie);
+
+ if (!sdp->sd_args.ar_loccookie)
+ continue;
+ offset = (char *)(darr[i]) -
+ (bh->b_data + gfs2_dirent_offset(bh->b_data));
+ offset /= GFS2_MIN_DIRENT_SIZE;
+ offset += leaf_nr * sdp->sd_max_dents_per_leaf;
+ if (offset >= GFS2_USE_HASH_FLAG ||
+ leaf_nr >= GFS2_USE_HASH_FLAG) {
+ darr[i]->de_cookie |= GFS2_USE_HASH_FLAG;
+ if (sort_id < 0)
+ sort_id = i;
+ continue;
+ }
+ darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK;
+ darr[i]->de_cookie |= offset;
+ }
+ return sort_id;
+}
+
+
static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
int *copied, unsigned *depth,
u64 leaf_no)
@@ -1299,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
struct buffer_head *bh;
struct gfs2_leaf *lf;
unsigned entries = 0, entries2 = 0;
- unsigned leaves = 0;
- const struct gfs2_dirent **darr, *dent;
+ unsigned leaves = 0, leaf = 0, offset, sort_offset;
+ struct gfs2_dirent **darr, *dent;
struct dirent_gather g;
struct buffer_head **larr;
- int leaf = 0;
- int error, i;
+ int error, i, need_sort = 0, sort_id;
u64 lfn = leaf_no;
do {
@@ -1320,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
brelse(bh);
} while(lfn);
+ if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
+ need_sort = 1;
+ sort_offset = 0;
+ }
+
if (!entries)
return 0;
@@ -1333,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
if (!larr)
goto out;
- darr = (const struct gfs2_dirent **)(larr + leaves);
- g.pdent = darr;
+ darr = (struct gfs2_dirent **)(larr + leaves);
+ g.pdent = (const struct gfs2_dirent **)darr;
g.offset = 0;
lfn = leaf_no;
@@ -1345,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
lf = (struct gfs2_leaf *)bh->b_data;
lfn = be64_to_cpu(lf->lf_next);
if (lf->lf_entries) {
+ offset = g.offset;
entries2 += be16_to_cpu(lf->lf_entries);
dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
gfs2_dirent_gather, NULL, &g);
@@ -1362,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
goto out_free;
}
error = 0;
+ sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
+ be16_to_cpu(lf->lf_entries));
+ if (!need_sort && sort_id >= 0) {
+ need_sort = 1;
+ sort_offset = offset + sort_id;
+ }
larr[leaf++] = bh;
} else {
+ larr[leaf++] = NULL;
brelse(bh);
}
} while(lfn);
BUG_ON(entries2 != entries);
- error = do_filldir_main(ip, ctx, darr, entries, copied);
+ error = do_filldir_main(ip, ctx, darr, entries, need_sort ?
+ sort_offset : entries, copied);
out_free:
for(i = 0; i < leaf; i++)
- brelse(larr[i]);
+ if (larr[i])
+ brelse(larr[i]);
kvfree(larr);
out:
return error;
@@ -1478,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
struct gfs2_inode *dip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct dirent_gather g;
- const struct gfs2_dirent **darr, *dent;
+ struct gfs2_dirent **darr, *dent;
struct buffer_head *dibh;
int copied = 0;
int error;
@@ -1502,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
/* 96 is max number of dirents which can be stuffed into an inode */
darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
if (darr) {
- g.pdent = darr;
+ g.pdent = (const struct gfs2_dirent **)darr;
g.offset = 0;
dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
gfs2_dirent_gather, NULL, &g);
@@ -1519,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
error = -EIO;
goto out;
}
+ gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
error = do_filldir_main(dip, ctx, darr,
- dip->i_entries, &copied);
+ dip->i_entries, 0, &copied);
out:
kfree(darr);
}
@@ -1555,15 +1648,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
if (dent) {
+ struct inode *inode;
+ u16 rahead;
+
if (IS_ERR(dent))
return ERR_CAST(dent);
dtype = be16_to_cpu(dent->de_type);
+ rahead = be16_to_cpu(dent->de_rahead);
addr = be64_to_cpu(dent->de_inum.no_addr);
formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
brelse(bh);
if (fail_on_exist)
return ERR_PTR(-EEXIST);
- return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+ inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino);
+ if (!IS_ERR(inode))
+ GFS2_I(inode)->i_rahead = rahead;
+ return inode;
}
return ERR_PTR(-ENOENT);
}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 5d15e9498b48..d5bda8513457 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -137,7 +137,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
struct gfs2_sbd *sdp = sb->s_fs_info;
struct inode *inode;
- inode = gfs2_ilookup(sb, inum->no_addr, 0);
+ inode = gfs2_ilookup(sb, inum->no_addr);
if (inode) {
if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
iput(inode);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 9287a2d17b8c..208efc70ad49 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
gfsflags &= ~GFS2_DIF_TOPDIR;
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- return do_gfs2_set_flags(filp, gfsflags, ~0);
+ return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM);
}
- return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
+ return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA));
}
static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -336,8 +336,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
int hint = min_t(size_t, INT_MAX, blks);
- if (hint > atomic_read(&ip->i_res->rs_sizehint))
- atomic_set(&ip->i_res->rs_sizehint, hint);
+ if (hint > atomic_read(&ip->i_res.rs_sizehint))
+ atomic_set(&ip->i_res.rs_sizehint, hint);
}
/**
@@ -354,8 +354,8 @@ static int gfs2_allocate_page_backing(struct page *page)
{
struct inode *inode = page->mapping->host;
struct buffer_head bh;
- unsigned long size = PAGE_CACHE_SIZE;
- u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ unsigned long size = PAGE_SIZE;
+ u64 lblock = page->index << (PAGE_SHIFT - inode->i_blkbits);
do {
bh.b_state = 0;
@@ -386,7 +386,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned long last_index;
- u64 pos = page->index << PAGE_CACHE_SHIFT;
+ u64 pos = page->index << PAGE_SHIFT;
unsigned int data_blocks, ind_blocks, rblocks;
struct gfs2_holder gh;
loff_t size;
@@ -397,15 +397,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* Update file times before taking page lock */
file_update_time(vma->vm_file);
- ret = get_write_access(inode);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out;
- ret = gfs2_rs_alloc(ip);
- if (ret)
- goto out_write_access;
-
- gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
+ gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
@@ -415,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
set_bit(GIF_SW_PAGED, &ip->i_flags);
- if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
+ if (!gfs2_write_alloc_required(ip, pos, PAGE_SIZE)) {
lock_page(page);
if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
ret = -EAGAIN;
@@ -428,7 +424,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
goto out_unlock;
- gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+ gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
ap.target = data_blocks + ind_blocks;
ret = gfs2_quota_lock_check(ip, &ap);
if (ret)
@@ -451,7 +447,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
lock_page(page);
ret = -EINVAL;
size = i_size_read(inode);
- last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (size - 1) >> PAGE_SHIFT;
/* Check page index against inode size */
if (size == 0 || (page->index > last_index))
goto out_trans_end;
@@ -486,8 +482,6 @@ out_uninit:
set_page_dirty(page);
wait_for_stable_page(page);
}
-out_write_access:
- put_write_access(inode);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
@@ -623,7 +617,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
if (!(file->f_mode & FMODE_WRITE))
return 0;
- gfs2_rs_delete(ip, &inode->i_writecount);
+ gfs2_rsqa_delete(ip, &inode->i_writecount);
return 0;
}
@@ -703,7 +697,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct gfs2_inode *ip = GFS2_I(file_inode(file));
int ret;
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
return ret;
@@ -879,7 +873,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
rblocks += data_blocks ? data_blocks : 1;
error = gfs2_trans_begin(sdp, rblocks,
- PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+ PAGE_SIZE/sdp->sd_sb.sb_bsize);
if (error)
goto out_trans_fail;
@@ -897,8 +891,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
i_size_write(inode, pos + count);
- /* Marks the inode as dirty */
file_update_time(file);
+ mark_inode_dirty(inode);
}
return generic_write_sync(file, pos, count);
@@ -920,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
@@ -938,20 +932,21 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
if (ret)
goto out_unlock;
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret)
goto out_putw;
ret = __gfs2_fallocate(file, mode, offset, len);
if (ret)
- gfs2_rs_deltree(ip->i_res);
+ gfs2_rs_deltree(&ip->i_res);
+
out_putw:
put_write_access(inode);
out_unlock:
gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -962,7 +957,7 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
int error;
struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return (ssize_t)error;
@@ -1018,7 +1013,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
struct gfs2_inode *ip = GFS2_I(file_inode(file));
struct gfs2_glock *gl;
unsigned int state;
- int flags;
+ u16 flags;
int error = 0;
int sleeptime;
@@ -1032,7 +1027,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (fl_gh->gh_state == state)
goto out;
locks_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ &(struct file_lock) {
+ .fl_type = F_UNLCK,
+ .fl_flags = FL_FLOCK
+ });
gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9bd1244caf38..6539131c52a2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -246,8 +246,8 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
*/
static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh, *tmp;
@@ -260,10 +260,10 @@ restart:
if (may_grant(gl, gh)) {
if (gh->gh_list.prev == &gl->gl_holders &&
glops->go_lock) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
/* FIXME: eliminate this eventually */
ret = glops->go_lock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (ret) {
if (ret == 1)
return 2;
@@ -361,7 +361,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
unsigned state = ret & LM_OUT_ST_MASK;
int rv;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
trace_gfs2_glock_state_change(gl, state);
state_change(gl, state);
gh = find_first_waiter(gl);
@@ -405,7 +405,7 @@ retry:
pr_err("wanted %u got %u\n", gl->gl_target, state);
GLOCK_BUG_ON(gl, 1);
}
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
@@ -414,9 +414,9 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl, gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (rv) {
do_error(gl, rv);
goto out;
@@ -429,7 +429,7 @@ retry:
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
out_locked:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -441,12 +441,12 @@ out_locked:
*/
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- unsigned int lck_flags = gh ? gh->gh_flags : 0;
+ unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
int ret;
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
@@ -464,7 +464,7 @@ __acquires(&gl->gl_spin)
(gl->gl_state == LM_ST_EXCLUSIVE) ||
(lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
clear_bit(GLF_BLOCKING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (glops->go_sync)
glops->go_sync(gl);
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -485,7 +485,7 @@ __acquires(&gl->gl_spin)
gfs2_glock_put(gl);
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
/**
@@ -513,8 +513,8 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
*/
static void run_queue(struct gfs2_glock *gl, const int nonblock)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
int ret;
@@ -572,17 +572,24 @@ static void delete_work_func(struct work_struct *work)
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
+ /* If someone's using this glock to create a new dinode, the block must
+ have been freed by another node, then re-used, in which case our
+ iopen callback is too late after the fact. Ignore it. */
+ if (test_bit(GLF_INODE_CREATING, &gl->gl_flags))
+ goto out;
+
ip = gl->gl_object;
/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
if (ip)
- inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1);
+ inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
else
inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
if (inode && !IS_ERR(inode)) {
d_prune_aliases(inode);
iput(inode);
}
+out:
gfs2_glock_put(gl);
}
@@ -596,7 +603,7 @@ static void glock_work_func(struct work_struct *work)
finish_xmote(gl, gl->gl_reply);
drop_ref = 1;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
gl->gl_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != LM_ST_EXCLUSIVE) {
@@ -612,7 +619,7 @@ static void glock_work_func(struct work_struct *work)
}
}
run_queue(gl, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!delay)
gfs2_glock_put(gl);
else {
@@ -750,7 +757,7 @@ again:
*
*/
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
struct gfs2_holder *gh)
{
INIT_LIST_HEAD(&gh->gh_list);
@@ -774,7 +781,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
*
*/
-void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh)
{
gh->gh_state = state;
gh->gh_flags = flags;
@@ -876,8 +883,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
*/
static inline void add_to_queue(struct gfs2_holder *gh)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -926,10 +933,10 @@ fail:
do_cancel:
gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
return;
@@ -967,7 +974,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
@@ -977,7 +984,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
gl->gl_lockref.count--;
}
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!(gh->gh_flags & GL_ASYNC))
error = gfs2_glock_wait(gh);
@@ -1010,17 +1017,18 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
unsigned delay = 0;
int fast_path = 0;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
glops->go_unlock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
}
if (list_empty(&gl->gl_holders) &&
@@ -1033,7 +1041,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
gfs2_glock_add_to_lru(gl);
trace_gfs2_glock_queue(gh, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (likely(fast_path))
return;
@@ -1080,7 +1088,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
- unsigned int state, int flags, struct gfs2_holder *gh)
+ unsigned int state, u16 flags, struct gfs2_holder *gh)
{
struct gfs2_glock *gl;
int error;
@@ -1217,9 +1225,9 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
delay = gl->gl_hold_time;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
handle_callback(gl, state, delay, true);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1259,7 +1267,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
- * The gl_reply field is under the gl_spin lock so that it is ok
+ * The gl_reply field is under gl_lockref.lock so that it is ok
* to use a bitfield shared with other glock state fields.
*/
@@ -1267,20 +1275,20 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
}
gl->gl_lockref.count++;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
@@ -1326,14 +1334,14 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_entry(list->next, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
- if (!spin_trylock(&gl->gl_spin)) {
+ if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
atomic_inc(&lru_count);
continue;
}
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
goto add_back_to_lru;
}
clear_bit(GLF_LRU, &gl->gl_flags);
@@ -1343,7 +1351,7 @@ add_back_to_lru:
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gl->gl_lockref.count--;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
cond_resched_lock(&lru_lock);
}
}
@@ -1417,14 +1425,14 @@ static struct shrinker glock_shrinker = {
static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
- struct rhash_head *pos, *next;
+ struct rhash_head *pos;
const struct bucket_table *tbl;
int i;
rcu_read_lock();
tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
for (i = 0; i < tbl->size; i++) {
- rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) {
+ rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) {
if ((gl->gl_name.ln_sbd == sdp) &&
lockref_get_not_dead(&gl->gl_lockref))
examiner(gl);
@@ -1461,10 +1469,10 @@ static void clear_glock(struct gfs2_glock *gl)
{
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
}
@@ -1482,9 +1490,9 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gfs2_dump_glock(seq, gl);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static void dump_glock_func(struct gfs2_glock *gl)
@@ -1506,7 +1514,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
flush_workqueue(glock_workqueue);
glock_hash_walk(clear_glock, sdp);
flush_workqueue(glock_workqueue);
- wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
+ wait_event_timeout(sdp->sd_glock_wait,
+ atomic_read(&sdp->sd_glock_disposal) == 0,
+ HZ * 600);
glock_hash_walk(dump_glock_func, sdp);
}
@@ -1518,10 +1528,10 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
ret = gfs2_truncatei_resume(ip);
gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static const char *state2str(unsigned state)
@@ -1539,7 +1549,7 @@ static const char *state2str(unsigned state)
return "??";
}
-static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
{
char *p = buf;
if (flags & LM_FLAG_TRY)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 32572f71f027..46ab67fc16da 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -79,15 +79,15 @@ enum {
* requested had acquired and released the lock.
*/
-#define LM_FLAG_TRY 0x00000001
-#define LM_FLAG_TRY_1CB 0x00000002
-#define LM_FLAG_NOEXP 0x00000004
-#define LM_FLAG_ANY 0x00000008
-#define LM_FLAG_PRIORITY 0x00000010
-#define GL_ASYNC 0x00000040
-#define GL_EXACT 0x00000080
-#define GL_SKIP 0x00000100
-#define GL_NOCACHE 0x00000400
+#define LM_FLAG_TRY 0x0001
+#define LM_FLAG_TRY_1CB 0x0002
+#define LM_FLAG_NOEXP 0x0004
+#define LM_FLAG_ANY 0x0008
+#define LM_FLAG_PRIORITY 0x0010
+#define GL_ASYNC 0x0040
+#define GL_EXACT 0x0080
+#define GL_SKIP 0x0100
+#define GL_NOCACHE 0x0400
/*
* lm_async_cb return flags
@@ -141,7 +141,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
struct pid *pid;
/* Look in glock's list of holders for one with current task as owner */
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
@@ -151,7 +151,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
}
gh = NULL;
out:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return gh;
}
@@ -183,8 +183,8 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
int create, struct gfs2_glock **glp);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
- unsigned flags, struct gfs2_holder *gh);
-extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
+ u16 flags, struct gfs2_holder *gh);
+extern void gfs2_holder_reinit(unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern void gfs2_holder_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq(struct gfs2_holder *gh);
@@ -195,7 +195,7 @@ extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
- unsigned int state, int flags,
+ unsigned int state, u16 flags,
struct gfs2_holder *gh);
extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
@@ -215,7 +215,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
*/
static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
- unsigned int state, int flags,
+ unsigned int state, u16 flags,
struct gfs2_holder *gh)
{
int error;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 1f6c9c3fe5cb..437fd73e381e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/bio.h>
#include <linux/posix_acl.h>
+#include <linux/security.h>
#include "gfs2.h"
#include "incore.h"
@@ -146,11 +147,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
struct gfs2_rgrpd *rgd;
int error;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_rgrp_brelse(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
return;
@@ -162,11 +163,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
mapping_set_error(mapping, error);
gfs2_ail_empty_gl(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_free_clones(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -262,6 +263,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
if (ip) {
set_bit(GIF_INVALID, &ip->i_flags);
forget_all_cached_acls(&ip->i_inode);
+ security_inode_invalidate_secctx(&ip->i_inode);
gfs2_dir_hash_inval(ip);
}
}
@@ -542,7 +544,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
* iopen_go_callback - schedule the dcache entry for the inode to be deleted
* @gl: the glock
*
- * gl_spin lock is held while calling this
+ * gl_lockref.lock lock is held while calling this
*/
static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 121ed08d9d9f..a6a3389a07fc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -259,8 +259,8 @@ struct gfs2_holder {
struct gfs2_glock *gh_gl;
struct pid *gh_owner_pid;
- unsigned int gh_state;
- unsigned gh_flags;
+ u16 gh_flags;
+ u16 gh_state;
int gh_error;
unsigned long gh_iflags; /* HIF_... */
@@ -270,6 +270,13 @@ struct gfs2_holder {
/* Number of quota types we support */
#define GFS2_MAXQUOTAS 2
+struct gfs2_qadata { /* quota allocation data */
+ /* Quota stuff */
+ struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS];
+ struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS];
+ unsigned int qa_qd_num;
+};
+
/* Resource group multi-block reservation, in order of appearance:
Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -288,11 +295,6 @@ struct gfs2_blkreserv {
struct gfs2_rbm rs_rbm; /* Start of reservation */
u32 rs_free; /* how many blocks are still free */
u64 rs_inum; /* Inode number for reservation */
-
- /* ancillary quota stuff */
- struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
- struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
- unsigned int rs_qa_qd_num;
};
/*
@@ -326,6 +328,7 @@ enum {
GLF_LRU = 13,
GLF_OBJECT = 14, /* Used only for tracing */
GLF_BLOCKING = 15,
+ GLF_INODE_CREATING = 16, /* Inode creation occurring */
};
struct gfs2_glock {
@@ -334,9 +337,8 @@ struct gfs2_glock {
struct lm_lockname gl_name;
struct lockref gl_lockref;
-#define gl_spin gl_lockref.lock
- /* State fields protected by gl_spin */
+ /* State fields protected by gl_lockref.lock */
unsigned int gl_state:2, /* Current state */
gl_target:2, /* Target state */
gl_demote_state:2, /* State requested by remote node */
@@ -392,7 +394,8 @@ struct gfs2_inode {
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
- struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
+ struct gfs2_qadata *i_qadata; /* quota allocation data */
+ struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
@@ -403,6 +406,7 @@ struct gfs2_inode {
u32 i_diskflags;
u8 i_height;
u8 i_depth;
+ u16 i_rahead;
};
/*
@@ -559,6 +563,8 @@ struct gfs2_args {
unsigned int ar_errors:2; /* errors=withdraw | panic */
unsigned int ar_nobarrier:1; /* do not send barriers */
unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */
+ unsigned int ar_loccookie:1; /* use location based readdir
+ cookies */
int ar_commit; /* Commit interval */
int ar_statfs_quantum; /* The fast statfs interval */
int ar_quota_quantum; /* The quota interval */
@@ -686,6 +692,7 @@ struct gfs2_sbd {
u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
u32 sd_max_jheight; /* Max height of journaled file's meta tree */
u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+ u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
struct gfs2_args sd_args; /* Mount arguments */
struct gfs2_tune sd_tune; /* Filesystem tuning structure */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 063fdfcf8275..bb30f9a72c65 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -37,61 +37,9 @@
#include "super.h"
#include "glops.h"
-struct gfs2_skip_data {
- u64 no_addr;
- int skipped;
- int non_block;
-};
-
-static int iget_test(struct inode *inode, void *opaque)
+struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_skip_data *data = opaque;
-
- if (ip->i_no_addr == data->no_addr) {
- if (data->non_block &&
- inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
- data->skipped = 1;
- return 0;
- }
- return 1;
- }
- return 0;
-}
-
-static int iget_set(struct inode *inode, void *opaque)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_skip_data *data = opaque;
-
- if (data->skipped)
- return -ENOENT;
- inode->i_ino = (unsigned long)(data->no_addr);
- ip->i_no_addr = data->no_addr;
- return 0;
-}
-
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
-{
- unsigned long hash = (unsigned long)no_addr;
- struct gfs2_skip_data data;
-
- data.no_addr = no_addr;
- data.skipped = 0;
- data.non_block = non_block;
- return ilookup5(sb, hash, iget_test, &data);
-}
-
-static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr,
- int non_block)
-{
- struct gfs2_skip_data data;
- unsigned long hash = (unsigned long)no_addr;
-
- data.no_addr = no_addr;
- data.skipped = 0;
- data.non_block = non_block;
- return iget5_locked(sb, hash, iget_test, iget_set, &data);
+ return ilookup(sb, (unsigned long)no_addr);
}
/**
@@ -132,21 +80,21 @@ static void gfs2_set_iop(struct inode *inode)
* @sb: The super block
* @no_addr: The inode number
* @type: The type of the inode
- * non_block: Can we block on inodes that are being freed?
*
* Returns: A VFS inode, or an error
*/
struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
- u64 no_addr, u64 no_formal_ino, int non_block)
+ u64 no_addr, u64 no_formal_ino)
{
struct inode *inode;
struct gfs2_inode *ip;
struct gfs2_glock *io_gl = NULL;
int error;
- inode = gfs2_iget(sb, no_addr, non_block);
+ inode = iget_locked(sb, (unsigned long)no_addr);
ip = GFS2_I(inode);
+ ip->i_no_addr = no_addr;
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -191,13 +139,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
fail_refresh:
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
ip->i_iopen_gh.gh_gl->gl_object = NULL;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
fail_iopen:
if (io_gl)
gfs2_glock_put(io_gl);
fail_put:
ip->i_gl->gl_object = NULL;
- gfs2_glock_put(ip->i_gl);
fail:
iget_failed(inode);
return ERR_PTR(error);
@@ -221,7 +169,7 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
if (error)
goto fail;
- inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, 1);
+ inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0);
if (IS_ERR(inode))
goto fail;
@@ -592,8 +540,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct gfs2_glock *io_gl;
- int error, free_vfs_inode = 0;
+ struct gfs2_glock *io_gl = NULL;
+ int error, free_vfs_inode = 1;
u32 aflags = 0;
unsigned blocks = 1;
struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
@@ -601,7 +549,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
- error = gfs2_rs_alloc(dip);
+ error = gfs2_rsqa_alloc(dip);
if (error)
return error;
@@ -650,10 +598,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
- goto fail_free_vfs_inode;
+ goto fail_gunlock;
ip = GFS2_I(inode);
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto fail_free_acls;
@@ -685,6 +633,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
ip->i_entries = 2;
break;
}
+
+ /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */
+ if (dip->i_diskflags & GFS2_DIF_SYSTEM)
+ ip->i_diskflags |= GFS2_DIF_SYSTEM;
+
gfs2_set_inode_flags(inode);
if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) ||
@@ -724,6 +677,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock2;
+ BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags));
+
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
@@ -733,6 +688,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
gfs2_set_iop(inode);
insert_inode_hash(inode);
+ free_vfs_inode = 0; /* After this point, the inode is no longer
+ considered free. Any failures need to undo
+ the gfs2 structures. */
if (default_acl) {
error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
@@ -763,27 +721,25 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
}
gfs2_glock_dq_uninit(ghs);
gfs2_glock_dq_uninit(ghs + 1);
+ clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
return error;
fail_gunlock3:
- gfs2_glock_dq_uninit(ghs + 1);
- if (ip->i_gl)
- gfs2_glock_put(ip->i_gl);
- goto fail_gunlock;
-
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_put(io_gl);
fail_gunlock2:
+ if (io_gl)
+ clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags);
gfs2_glock_dq_uninit(ghs + 1);
fail_free_inode:
if (ip->i_gl)
gfs2_glock_put(ip->i_gl);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rsqa_delete(ip, NULL);
fail_free_acls:
if (default_acl)
posix_acl_release(default_acl);
if (acl)
posix_acl_release(acl);
-fail_free_vfs_inode:
- free_vfs_inode = 1;
fail_gunlock:
gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(ghs);
@@ -898,7 +854,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (S_ISDIR(inode->i_mode))
return -EPERM;
- error = gfs2_rs_alloc(dip);
+ error = gfs2_rsqa_alloc(dip);
if (error)
return error;
@@ -1371,7 +1327,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
return error;
- error = gfs2_rs_alloc(ndip);
+ error = gfs2_rsqa_alloc(ndip);
if (error)
return error;
@@ -1712,24 +1668,30 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
}
/**
- * gfs2_follow_link - Follow a symbolic link
+ * gfs2_get_link - Follow a symbolic link
* @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
+ * @inode: The inode of the link
+ * @done: destructor for return value
*
* This can handle symlinks of any size.
*
* Returns: 0 on success or error code
*/
-static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
+static const char *gfs2_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+ struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
struct buffer_head *dibh;
unsigned int size;
char *buf;
int error;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
error = gfs2_glock_nq(&i_gh);
if (error) {
@@ -1759,7 +1721,7 @@ static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
out:
gfs2_glock_dq_uninit(&i_gh);
if (!IS_ERR(buf))
- *cookie = buf;
+ set_delayed_call(done, kfree_link, buf);
return buf;
}
@@ -1854,11 +1816,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
ogid = ngid = NO_GID_QUOTA_CHANGE;
- error = get_write_access(inode);
- if (error)
- return error;
-
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto out;
@@ -1898,7 +1856,6 @@ out_end_trans:
out_gunlock_q:
gfs2_quota_unlock(ip);
out:
- put_write_access(inode);
return error;
}
@@ -1920,7 +1877,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct gfs2_holder i_gh;
int error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
@@ -2002,7 +1959,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret == 0) {
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret == 0)
ret = generic_setxattr(dentry, name, data, size, flags);
gfs2_glock_dq(&gh);
@@ -2043,7 +2000,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret == 0) {
- ret = gfs2_rs_alloc(ip);
+ ret = gfs2_rsqa_alloc(ip);
if (ret == 0)
ret = generic_removexattr(dentry, name);
gfs2_glock_dq(&gh);
@@ -2063,7 +2020,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
if (ret)
@@ -2090,7 +2047,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
gfs2_glock_dq_uninit(&gh);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -2132,8 +2089,7 @@ const struct inode_operations gfs2_dir_iops = {
const struct inode_operations gfs2_symlink_iops = {
.readlink = generic_readlink,
- .follow_link = gfs2_follow_link,
- .put_link = kfree_put_link,
+ .get_link = gfs2_get_link,
.permission = gfs2_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index ba4d9492d422..e1af0d4aa308 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -94,12 +94,11 @@ err:
}
extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
- u64 no_addr, u64 no_formal_ino,
- int non_block);
+ u64 no_addr, u64 no_formal_ino);
extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
u64 *no_formal_ino,
unsigned int blktype);
-extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 284c1542783e..8b907c5cc913 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -50,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
s64 delta = sample - s->stats[index];
s->stats[index] += (delta >> 3);
index++;
- s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2);
+ s->stats[index] += ((abs(delta) - s->stats[index]) >> 2);
}
/**
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 536e7a6252cd..0ff028c15199 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -716,6 +716,9 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
}
trace_gfs2_log_flush(sdp, 1);
+ if (type == SHUTDOWN_FLUSH)
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
sdp->sd_log_flush_head = sdp->sd_log_head;
sdp->sd_log_flush_wrapped = 0;
tr = sdp->sd_log_tr;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 241a399bf83d..f99f8e94de3f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,9 @@ static void gfs2_init_inode_once(void *foo)
inode_init_once(&ip->i_inode);
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
- ip->i_res = NULL;
+ ip->i_qadata = NULL;
+ memset(&ip->i_res, 0, sizeof(ip->i_res));
+ RB_CLEAR_NODE(&ip->i_res.rs_node);
ip->i_hash_cache = NULL;
}
@@ -50,7 +52,7 @@ static void gfs2_init_glock_once(void *foo)
struct gfs2_glock *gl = foo;
INIT_HLIST_BL_NODE(&gl->gl_list);
- spin_lock_init(&gl->gl_spin);
+ spin_lock_init(&gl->gl_lockref.lock);
INIT_LIST_HEAD(&gl->gl_holders);
INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
@@ -112,7 +114,8 @@ static int __init init_gfs2_fs(void)
gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
sizeof(struct gfs2_inode),
0, SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT,
gfs2_init_inode_once);
if (!gfs2_inode_cachep)
goto fail;
@@ -135,10 +138,10 @@ static int __init init_gfs2_fs(void)
if (!gfs2_quotad_cachep)
goto fail;
- gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk",
- sizeof(struct gfs2_blkreserv),
+ gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata",
+ sizeof(struct gfs2_qadata),
0, 0, NULL);
- if (!gfs2_rsrv_cachep)
+ if (!gfs2_qadata_cachep)
goto fail;
register_shrinker(&gfs2_qd_shrinker);
@@ -193,8 +196,8 @@ fail_lru:
unregister_shrinker(&gfs2_qd_shrinker);
gfs2_glock_exit();
- if (gfs2_rsrv_cachep)
- kmem_cache_destroy(gfs2_rsrv_cachep);
+ if (gfs2_qadata_cachep)
+ kmem_cache_destroy(gfs2_qadata_cachep);
if (gfs2_quotad_cachep)
kmem_cache_destroy(gfs2_quotad_cachep);
@@ -238,7 +241,7 @@ static void __exit exit_gfs2_fs(void)
rcu_barrier();
mempool_destroy(gfs2_page_pool);
- kmem_cache_destroy(gfs2_rsrv_cachep);
+ kmem_cache_destroy(gfs2_qadata_cachep);
kmem_cache_destroy(gfs2_quotad_cachep);
kmem_cache_destroy(gfs2_rgrpd_cachep);
kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0e1d4be5865a..0448524c11bc 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -124,7 +124,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
if (mapping == NULL)
mapping = &sdp->sd_aspace;
- shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+ shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
index = blkno >> shift; /* convert block to page */
bufnum = blkno - (index << shift); /* block buf index within page */
@@ -154,7 +154,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
map_bh(bh, sdp->sd_vfs, blkno);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return bh;
}
@@ -187,6 +187,52 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
return bh;
}
+static void gfs2_meta_read_endio(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+ struct buffer_head *bh = page_buffers(page);
+ unsigned int len = bvec->bv_len;
+
+ while (bh_offset(bh) < bvec->bv_offset)
+ bh = bh->b_this_page;
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ len -= bh->b_size;
+ bh->b_end_io(bh, !bio->bi_error);
+ bh = next;
+ } while (bh && len);
+ }
+ bio_put(bio);
+}
+
+/*
+ * Submit several consecutive buffer head I/O requests as a single bio I/O
+ * request. (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+{
+ struct buffer_head *bh = bhs[0];
+ struct bio *bio;
+ int i;
+
+ if (!num)
+ return;
+
+ bio = bio_alloc(GFP_NOIO, num);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_bdev = bh->b_bdev;
+ for (i = 0; i < num; i++) {
+ bh = bhs[i];
+ bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ }
+ bio->bi_end_io = gfs2_meta_read_endio;
+ submit_bio(rw, bio);
+}
+
/**
* gfs2_meta_read - Read a block from disk
* @gl: The glock covering the block
@@ -198,10 +244,11 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
*/
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- struct buffer_head **bhp)
+ int rahead, struct buffer_head **bhp)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct buffer_head *bh;
+ struct buffer_head *bh, *bhs[2];
+ int num = 0;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
*bhp = NULL;
@@ -213,14 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
- return 0;
+ flags &= ~DIO_WAIT;
+ } else {
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ bhs[num++] = bh;
}
- bh->b_end_io = end_buffer_read_sync;
- get_bh(bh);
- submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
+
+ if (rahead) {
+ bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ brelse(bh);
+ } else {
+ bh->b_end_io = end_buffer_read_sync;
+ bhs[num++] = bh;
+ }
+ }
+
+ gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
+ bh = *bhp;
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh))) {
struct gfs2_trans *tr = current->journal_info;
@@ -341,8 +405,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head *bh;
int ret = 0;
u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+ int rahead = 0;
+
+ if (num == ip->i_no_addr)
+ rahead = ip->i_rahead;
- ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+ ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);
if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
brelse(bh);
ret = -EIO;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 8ca161567a93..c5086c8af5ed 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
- struct buffer_head **bhp);
+ int rahead, struct buffer_head **bhp);
extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
int create);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 02586e7eb964..49b0bff18fe3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
sdp->sd_jheightsize[x] = ~0;
gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+ sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_leaf)) /
+ GFS2_MIN_DIRENT_SIZE;
return 0;
}
@@ -451,7 +454,7 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
struct dentry *dentry;
struct inode *inode;
- inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+ inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
if (IS_ERR(inode)) {
fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
return PTR_ERR(inode);
@@ -910,8 +913,7 @@ fail_qc_i:
fail_ut_i:
iput(sdp->sd_sc_inode);
fail:
- if (pn)
- iput(pn);
+ iput(pn);
return error;
}
@@ -1291,6 +1293,9 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
up_write(&s->s_umount);
blkdev_put(bdev, mode);
down_write(&s->s_umount);
+ } else {
+ /* s_mode must be set before deactivate_locked_super calls */
+ s->s_mode = mode;
}
memset(&args, 0, sizeof(args));
@@ -1312,10 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
if ((flags ^ s->s_flags) & MS_RDONLY)
goto error_super;
} else {
- char b[BDEVNAME_SIZE];
-
- s->s_mode = mode;
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
if (error)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3a31226531ea..ce7d69a2fdc0 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd)
error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
if (error)
goto fail;
- error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh);
if (error)
goto fail;
error = -EIO;
@@ -527,37 +527,70 @@ static void qdsb_put(struct gfs2_quota_data *qd)
qd_put(qd);
}
+/**
+ * gfs2_qa_alloc - make sure we have a quota allocations data structure,
+ * if necessary
+ * @ip: the inode for this reservation
+ */
+int gfs2_qa_alloc(struct gfs2_inode *ip)
+{
+ int error = 0;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+
+ down_write(&ip->i_rw_mutex);
+ if (ip->i_qadata == NULL) {
+ ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
+ if (!ip->i_qadata)
+ error = -ENOMEM;
+ }
+ up_write(&ip->i_rw_mutex);
+ return error;
+}
+
+void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+{
+ down_write(&ip->i_rw_mutex);
+ if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
+ kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
+ ip->i_qadata = NULL;
+ }
+ up_write(&ip->i_rw_mutex);
+}
+
int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data **qd;
int error;
- if (ip->i_res == NULL) {
- error = gfs2_rs_alloc(ip);
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+
+ if (ip->i_qadata == NULL) {
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
}
- qd = ip->i_res->rs_qa_qd;
+ qd = ip->i_qadata->qa_qd;
- if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) ||
+ if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) ||
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
return -EIO;
- if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
- return 0;
-
error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
@@ -565,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
error = qdsb_get(sdp, make_kqid_uid(uid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -574,7 +607,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
error = qdsb_get(sdp, make_kqid_gid(gid), qd);
if (error)
goto out;
- ip->i_res->rs_qa_qd_num++;
+ ip->i_qadata->qa_qd_num++;
qd++;
}
@@ -587,17 +620,17 @@ out:
void gfs2_quota_unhold(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int x;
+ u32 x;
- if (ip->i_res == NULL)
+ if (ip->i_qadata == NULL)
return;
gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qdsb_put(ip->i_res->rs_qa_qd[x]);
- ip->i_res->rs_qa_qd[x] = NULL;
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qdsb_put(ip->i_qadata->qa_qd[x]);
+ ip->i_qadata->qa_qd[x] = NULL;
}
- ip->i_res->rs_qa_qd_num = 0;
+ ip->i_qadata->qa_qd_num = 0;
}
static int sort_qd(const void *a, const void *b)
@@ -668,7 +701,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
unsigned to_write = bytes, pg_off = off;
int done = 0;
- blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
+ blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift);
boff = off % bsize;
page = find_or_create_page(mapping, index, GFP_NOFS);
@@ -720,13 +753,13 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
flush_dcache_page(page);
kunmap_atomic(kaddr);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return 0;
unlock_out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return -EIO;
}
@@ -740,13 +773,13 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
nbytes = sizeof(struct gfs2_quota);
- pg_beg = loc >> PAGE_CACHE_SHIFT;
- pg_off = loc % PAGE_CACHE_SIZE;
+ pg_beg = loc >> PAGE_SHIFT;
+ pg_off = loc % PAGE_SIZE;
/* If the quota straddles a page boundary, split the write in two */
- if ((pg_off + nbytes) > PAGE_CACHE_SIZE) {
+ if ((pg_off + nbytes) > PAGE_SIZE) {
pg_oflow = 1;
- overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE;
+ overflow = (pg_off + nbytes) - PAGE_SIZE;
}
ptr = qp;
@@ -843,7 +876,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
unsigned int nalloc = 0, blocks;
int error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
return error;
@@ -855,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
return -ENOMEM;
sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
- mutex_lock(&ip->i_inode.i_mutex);
+ inode_lock(&ip->i_inode);
for (qx = 0; qx < num_qd; qx++) {
error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
GL_NOCACHE, &ghs[qx]);
@@ -920,7 +953,7 @@ out_alloc:
out:
while (qx--)
gfs2_glock_dq_uninit(&ghs[qx]);
- mutex_unlock(&ip->i_inode.i_mutex);
+ inode_unlock(&ip->i_inode);
kfree(ghs);
gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
return error;
@@ -1003,23 +1036,23 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
- unsigned int x;
+ u32 x;
int error = 0;
- error = gfs2_quota_hold(ip, uid, gid);
- if (error)
- return error;
-
if (capable(CAP_SYS_RESOURCE) ||
sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num,
+ error = gfs2_quota_hold(ip, uid, gid);
+ if (error)
+ return error;
+
+ sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num,
sizeof(struct gfs2_quota_data *), sort_qd, NULL);
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
- error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
+ error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]);
if (error)
break;
}
@@ -1028,7 +1061,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
set_bit(GIF_QD_LOCKED, &ip->i_flags);
else {
while (x--)
- gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
gfs2_quota_unhold(ip);
}
@@ -1076,20 +1109,20 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qda[4];
unsigned int count = 0;
- unsigned int x;
+ u32 x;
int found;
if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
goto out;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
struct gfs2_quota_data *qd;
int sync;
- qd = ip->i_res->rs_qa_qd[x];
+ qd = ip->i_qadata->qa_qd[x];
sync = need_sync(qd);
- gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+ gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
if (!sync)
continue;
@@ -1158,7 +1191,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
s64 value, warn, limit;
- unsigned int x;
+ u32 x;
int error = 0;
ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
@@ -1168,8 +1201,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
qid_eq(qd->qd_id, make_kqid_gid(gid))))
@@ -1216,15 +1249,17 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
kuid_t uid, kgid_t gid)
{
struct gfs2_quota_data *qd;
- unsigned int x;
+ u32 x;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON ||
+ gfs2_assert_warn(sdp, change))
return;
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
return;
- for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
- qd = ip->i_res->rs_qa_qd[x];
+ for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+ qd = ip->i_qadata->qa_qd[x];
if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
qid_eq(qd->qd_id, make_kqid_gid(gid))) {
@@ -1635,11 +1670,11 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (error)
return error;
- error = gfs2_rs_alloc(ip);
+ error = gfs2_rsqa_alloc(ip);
if (error)
goto out_put;
- mutex_lock(&ip->i_inode.i_mutex);
+ inode_lock(&ip->i_inode);
error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
if (error)
goto out_unlockput;
@@ -1704,7 +1739,7 @@ out_i:
out_q:
gfs2_glock_dq_uninit(&q_gh);
out_unlockput:
- mutex_unlock(&ip->i_inode.i_mutex);
+ inode_unlock(&ip->i_inode);
out_put:
qd_put(qd);
return error;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index ad04b3acae2b..5e47c935a515 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -18,6 +18,8 @@ struct gfs2_sbd;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
+extern int gfs2_qa_alloc(struct gfs2_inode *ip);
+extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
extern void gfs2_quota_unhold(struct gfs2_inode *ip);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 475985d14758..99a0bdac8796 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -596,27 +596,13 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
}
/**
- * gfs2_rs_alloc - make sure we have a reservation assigned to the inode
+ * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode
+ * plus a quota allocations data structure, if necessary
* @ip: the inode for this reservation
*/
-int gfs2_rs_alloc(struct gfs2_inode *ip)
+int gfs2_rsqa_alloc(struct gfs2_inode *ip)
{
- int error = 0;
-
- down_write(&ip->i_rw_mutex);
- if (ip->i_res)
- goto out;
-
- ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
- if (!ip->i_res) {
- error = -ENOMEM;
- goto out;
- }
-
- RB_CLEAR_NODE(&ip->i_res->rs_node);
-out:
- up_write(&ip->i_rw_mutex);
- return error;
+ return gfs2_qa_alloc(ip);
}
static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -678,21 +664,20 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
}
/**
- * gfs2_rs_delete - delete a multi-block reservation
+ * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
* @ip: The inode for this reservation
* @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
down_write(&ip->i_rw_mutex);
- if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
- gfs2_rs_deltree(ip->i_res);
- BUG_ON(ip->i_res->rs_free);
- kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
- ip->i_res = NULL;
+ if ((wcount == NULL) || (atomic_read(wcount) <= 1)) {
+ gfs2_rs_deltree(&ip->i_res);
+ BUG_ON(ip->i_res.rs_free);
}
up_write(&ip->i_rw_mutex);
+ gfs2_qa_delete(ip, wcount);
}
/**
@@ -729,9 +714,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_object = NULL;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
@@ -933,8 +918,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
goto fail;
rgd->rd_gl->gl_object = rgd;
- rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
- rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
+ rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
+ rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -1157,7 +1142,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
for (x = 0; x < length; x++) {
bi = rgd->rd_bits + x;
- error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
+ error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
if (error)
goto fail;
}
@@ -1455,7 +1440,7 @@ static void rs_insert(struct gfs2_inode *ip)
{
struct rb_node **newn, *parent = NULL;
int rc;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
@@ -1502,7 +1487,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
{
struct gfs2_rbm rbm = { .rgd = rgd, };
u64 goal;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
u32 extlen;
u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
int ret;
@@ -1573,7 +1558,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
}
if (n) {
- while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) {
+ while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
n = n->rb_right;
if (n == NULL)
@@ -1803,7 +1788,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
continue;
*last_unlinked = block;
- error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl);
+ error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
if (error)
continue;
@@ -1983,7 +1968,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *begin = NULL;
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
int error = 0, rg_locked, flags = 0;
u64 last_unlinked = NO_BLOCK;
int loops = 0;
@@ -2112,7 +2097,7 @@ next_rgrp:
void gfs2_inplace_release(struct gfs2_inode *ip)
{
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
if (rs->rs_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2266,7 +2251,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
static void gfs2_adjust_reservation(struct gfs2_inode *ip,
const struct gfs2_rbm *rbm, unsigned len)
{
- struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = &ip->i_res;
struct gfs2_rgrpd *rgd = rbm->rgd;
unsigned rlen;
u64 block;
@@ -2309,8 +2294,8 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
{
u64 goal;
- if (gfs2_rs_active(ip->i_res)) {
- *rbm = ip->i_res->rs_rbm;
+ if (gfs2_rs_active(&ip->i_res)) {
+ *rbm = ip->i_res.rs_rbm;
return;
}
@@ -2364,7 +2349,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
gfs2_alloc_extent(&rbm, dinode, nblocks);
block = gfs2_rbm_to_block(&rbm);
rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
- if (gfs2_rs_active(ip->i_res))
+ if (gfs2_rs_active(&ip->i_res))
gfs2_adjust_reservation(ip, &rbm, *nblocks);
ndata = *nblocks;
if (dinode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index c0ab33fa3eed..66b51cf66dfa 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -49,9 +49,9 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
bool dinode, u64 *generation);
-extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
@@ -78,7 +78,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
extern int gfs2_fitrim(struct file *filp, void __user *argp);
/* This is how to tell if a reservation is in the rgrp tree: */
-static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
+static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
{
return rs && !RB_EMPTY_NODE(&rs->rs_node);
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 894fb01a91da..f8a0cd821290 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
Opt_nobarrier,
Opt_rgrplvb,
Opt_norgrplvb,
+ Opt_loccookie,
+ Opt_noloccookie,
Opt_error,
};
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
{Opt_nobarrier, "nobarrier"},
{Opt_rgrplvb, "rgrplvb"},
{Opt_norgrplvb, "norgrplvb"},
+ {Opt_loccookie, "loccookie"},
+ {Opt_noloccookie, "noloccookie"},
{Opt_error, NULL}
};
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
case Opt_norgrplvb:
args->ar_rgrplvb = 0;
break;
+ case Opt_loccookie:
+ args->ar_loccookie = 1;
+ break;
+ case Opt_noloccookie:
+ args->ar_loccookie = 0;
+ break;
case Opt_error:
default:
pr_warn("invalid mount option: %s\n", o);
@@ -556,6 +566,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+ gfs2_trans_add_meta(m_ip->i_gl, m_bh);
spin_lock(&sdp->sd_statfs_spin);
m_sc->sc_total += l_sc->sc_total;
@@ -564,10 +575,8 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
memset(l_bh->b_data + sizeof(struct gfs2_dinode),
0, sizeof(struct gfs2_statfs_change));
- spin_unlock(&sdp->sd_statfs_spin);
-
- gfs2_trans_add_meta(m_ip->i_gl, m_bh);
gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+ spin_unlock(&sdp->sd_statfs_spin);
}
int gfs2_statfs_sync(struct super_block *sb, int type)
@@ -842,10 +851,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
gfs2_quota_sync(sdp->sd_vfs, 0);
gfs2_statfs_sync(sdp->sd_vfs, 0);
- down_write(&sdp->sd_log_flush_lock);
- clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- up_write(&sdp->sd_log_flush_lock);
-
gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
@@ -1419,6 +1424,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",demote_interface_used");
if (args->ar_rgrplvb)
seq_puts(s, ",rgrplvb");
+ if (args->ar_loccookie)
+ seq_puts(s, ",loccookie");
return 0;
}
@@ -1512,6 +1519,7 @@ static void gfs2_evict_inode(struct inode *inode)
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
+ struct address_space *metamapping;
int error;
if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
@@ -1526,7 +1534,8 @@ static void gfs2_evict_inode(struct inode *inode)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (unlikely(error)) {
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
goto out;
}
@@ -1542,12 +1551,16 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_truncate;
}
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_wait(&ip->i_iopen_gh);
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
- error = gfs2_glock_nq(&ip->i_iopen_gh);
- if (error)
- goto out_truncate;
+ if (ip->i_iopen_gh.gh_gl &&
+ test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE,
+ &ip->i_iopen_gh);
+ error = gfs2_glock_nq(&ip->i_iopen_gh);
+ if (error)
+ goto out_truncate;
+ }
/* Case 1 starts here */
@@ -1575,8 +1588,8 @@ static void gfs2_evict_inode(struct inode *inode)
out_truncate:
gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+ metamapping = gfs2_glock2aspace(ip->i_gl);
if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
- struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
filemap_fdatawrite(metamapping);
filemap_fdatawait(metamapping);
}
@@ -1589,25 +1602,28 @@ out_truncate:
goto out_unlock;
/* Needs to be done before glock release & also in a transaction */
truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages(metamapping, 0);
gfs2_trans_end(sdp);
out_unlock:
/* Error path for case 1 */
- if (gfs2_rs_active(ip->i_res))
- gfs2_rs_deltree(ip->i_res);
+ if (gfs2_rs_active(&ip->i_res))
+ gfs2_rs_deltree(&ip->i_res);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
+ if (ip->i_iopen_gh.gh_gl) {
+ if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ }
+ gfs2_holder_uninit(&ip->i_iopen_gh);
}
- gfs2_holder_uninit(&ip->i_iopen_gh);
gfs2_glock_dq_uninit(&gh);
if (error && error != GLR_TRYFAILED && error != -EROFS)
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
out:
/* Case 3 starts here */
truncate_inode_pages_final(&inode->i_data);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rsqa_delete(ip, NULL);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
@@ -1619,7 +1635,8 @@ out:
if (ip->i_iopen_gh.gh_gl) {
ip->i_iopen_gh.gh_gl->gl_object = NULL;
ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_wait(&ip->i_iopen_gh);
+ gfs2_holder_uninit(&ip->i_iopen_gh);
}
}
@@ -1632,7 +1649,9 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
ip->i_flags = 0;
ip->i_gl = NULL;
ip->i_rgd = NULL;
- ip->i_res = NULL;
+ memset(&ip->i_res, 0, sizeof(ip->i_res));
+ RB_CLEAR_NODE(&ip->i_res.rs_node);
+ ip->i_rahead = 0;
}
return &ip->i_inode;
}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index b95d0d625f32..0c1bde395062 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -176,6 +176,8 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
unlock_buffer(bh);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+ else
+ bd = bh->b_private;
lock_buffer(bh);
gfs2_log_lock(sdp);
}
@@ -236,6 +238,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
lock_page(bh->b_page);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+ else
+ bd = bh->b_private;
unlock_page(bh->b_page);
lock_buffer(bh);
gfs2_log_lock(sdp);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 86d2035ac669..cf645835710f 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -27,7 +27,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly;
struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
struct kmem_cache *gfs2_quotad_cachep __read_mostly;
-struct kmem_cache *gfs2_rsrv_cachep __read_mostly;
+struct kmem_cache *gfs2_qadata_cachep __read_mostly;
mempool_t *gfs2_page_pool __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index cbdcbdf39614..c81295f407f6 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -149,7 +149,7 @@ extern struct kmem_cache *gfs2_inode_cachep;
extern struct kmem_cache *gfs2_bufdata_cachep;
extern struct kmem_cache *gfs2_rgrpd_cachep;
extern struct kmem_cache *gfs2_quotad_cachep;
-extern struct kmem_cache *gfs2_rsrv_cachep;
+extern struct kmem_cache *gfs2_qadata_cachep;
extern mempool_t *gfs2_page_pool;
static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4c096fa9e2a1..e8dfb4740c04 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
__be64 *eablk, *end;
int error;
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh);
if (error)
return error;
@@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
break;
bn = be64_to_cpu(*eablk);
- error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+ error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh);
if (error)
break;
error = ea_foreach_i(ip, eabh, ea_call, data);
@@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
return -ENOMEM;
for (x = 0; x < nptrs; x++) {
- error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+ error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0,
bh + x);
if (error) {
while (x--)
@@ -583,11 +583,13 @@ out:
*
* Returns: actual size of data on success, -errno on error
*/
-static int gfs2_xattr_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int gfs2_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
struct gfs2_ea_location el;
+ int type = handler->flags;
int error;
if (!ip->i_eattr)
@@ -977,7 +979,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
__be64 *end;
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0,
&indbh);
if (error)
return error;
@@ -1227,61 +1229,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
return error;
}
-static int gfs2_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int gfs2_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
return __gfs2_xattr_set(d_inode(dentry), name, value,
- size, flags, type);
-}
-
-
-static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
- struct gfs2_ea_header *ea, char *data)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- unsigned int amount = GFS2_EA_DATA_LEN(ea);
- unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
- int ret;
-
- ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
- if (ret)
- return ret;
-
- ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
- gfs2_trans_end(sdp);
-
- return ret;
-}
-
-int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
-{
- struct inode *inode = &ip->i_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct gfs2_ea_location el;
- int error;
-
- error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
- if (error)
- return error;
-
- if (GFS2_EA_IS_STUFFED(el.el_ea)) {
- error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
- if (error == 0) {
- gfs2_trans_add_meta(ip->i_gl, el.el_bh);
- memcpy(GFS2_EA2DATA(el.el_ea), data,
- GFS2_EA_DATA_LEN(el.el_ea));
- }
- } else {
- error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
- }
-
- brelse(el.el_bh);
- if (error)
- return error;
-
- error = gfs2_setattr_simple(inode, attr);
- gfs2_trans_end(sdp);
- return error;
+ size, flags, handler->flags);
}
static int ea_dealloc_indirect(struct gfs2_inode *ip)
@@ -1303,7 +1256,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
- error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+ error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh);
if (error)
return error;
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index d392f8358f2f..2d887c88eb49 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,6 +62,5 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
/* Exported to acl.c */
extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
-extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 221719eac5de..d77d844b668b 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -278,14 +278,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
mapping = tree->inode->i_mapping;
off = (loff_t)cnid * tree->node_size;
- block = off >> PAGE_CACHE_SHIFT;
- node->page_offset = off & ~PAGE_CACHE_MASK;
+ block = off >> PAGE_SHIFT;
+ node->page_offset = off & ~PAGE_MASK;
for (i = 0; i < tree->pages_per_bnode; i++) {
page = read_mapping_page(mapping, block++, NULL);
if (IS_ERR(page))
goto fail;
if (PageError(page)) {
- page_cache_release(page);
+ put_page(page);
goto fail;
}
node->page[i] = page;
@@ -401,7 +401,7 @@ void hfs_bnode_free(struct hfs_bnode *node)
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
- page_cache_release(node->page[i]);
+ put_page(node->page[i]);
kfree(node);
}
@@ -429,11 +429,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
pagep = node->page;
memset(kmap(*pagep) + node->page_offset, 0,
- min((int)PAGE_CACHE_SIZE, (int)tree->node_size));
+ min((int)PAGE_SIZE, (int)tree->node_size));
set_page_dirty(*pagep);
kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+ memset(kmap(*++pagep), 0, PAGE_SIZE);
set_page_dirty(*pagep);
kunmap(*pagep);
}
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 1ab19e660e69..37cdd955eceb 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -116,14 +116,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
}
tree->node_size_shift = ffs(size) - 1;
- tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
kunmap(page);
- page_cache_release(page);
+ put_page(page);
return tree;
fail_page:
- page_cache_release(page);
+ put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfs_aops;
iput(tree->inode);
@@ -257,9 +257,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
idx = 0;
for (;;) {
@@ -279,7 +279,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
}
- if (++off >= PAGE_CACHE_SIZE) {
+ if (++off >= PAGE_SIZE) {
kunmap(*pagep);
data = kmap(*++pagep);
off = 0;
@@ -302,9 +302,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
len = hfs_brec_lenoff(node, 0, &off16);
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
}
}
@@ -348,9 +348,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
len = hfs_brec_lenoff(node, 0, &off);
}
off += node->page_offset + nidx / 8;
- page = node->page[off >> PAGE_CACHE_SHIFT];
+ page = node->page[off >> PAGE_SHIFT];
data = kmap(page);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index db458ee3a546..1eb5d415d434 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -214,7 +214,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
{
struct super_block *sb;
struct hfs_find_data fd;
- struct list_head *pos;
+ struct hfs_readdir_data *rd;
int res, type;
hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
@@ -240,9 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
}
}
- list_for_each(pos, &HFS_I(dir)->open_dir_list) {
- struct hfs_readdir_data *rd =
- list_entry(pos, struct hfs_readdir_data, list);
+ list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) {
if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
rd->file->f_pos--;
}
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 70788e03820a..e9f2b855f831 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
{
struct hfs_readdir_data *rd = file->private_data;
if (rd) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
list_del(&rd->list);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
kfree(rd);
}
return 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index b99ebddb10cb..cb1e5faa2fb7 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -91,8 +91,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
if (!tree)
return 0;
- if (tree->node_size >= PAGE_CACHE_SIZE) {
- nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
+ if (tree->node_size >= PAGE_SIZE) {
+ nidx = page->index >> (tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
@@ -105,8 +105,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
}
spin_unlock(&tree->hash_lock);
} else {
- nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
- i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+ nidx = page->index << (PAGE_SHIFT - tree->node_size_shift);
+ i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
do {
node = hfs_bnode_findhash(tree, nidx++);
@@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file)
if (HFS_IS_RSRC(inode))
inode = HFS_I(inode)->rsrc_inode;
if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
hfs_file_truncate(inode);
//if (inode->i_flags & S_DEAD) {
// hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
// hfs_delete_inode(inode);
//}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
@@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* sync the inode to buffers */
ret = write_inode_now(inode, 0);
@@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
err = sync_blockdev(sb->s_bdev);
if (!ret)
ret = err;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index aa3f0d6d043c..a3ec3ae7d347 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -166,7 +166,7 @@ int hfs_mdb_get(struct super_block *sb)
pr_warn("continuing without an alternate MDB\n");
}
- HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
+ HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL);
if (!HFS_SB(sb)->bitmap)
goto out;
@@ -360,7 +360,7 @@ void hfs_mdb_put(struct super_block *sb)
unload_nls(HFS_SB(sb)->nls_io);
unload_nls(HFS_SB(sb)->nls_disk);
- free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
+ kfree(HFS_SB(sb)->bitmap);
kfree(HFS_SB(sb));
sb->s_fs_info = NULL;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4574fdd3d421..1ca95c232bb5 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -483,8 +483,8 @@ static int __init init_hfs_fs(void)
int err;
hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
- sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
- hfs_init_once);
+ sizeof(struct hfs_inode_info), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
if (!hfs_inode_cachep)
return -ENOMEM;
err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index d2954451519e..c0ae274c0a22 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -13,7 +13,7 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
-#define PAGE_CACHE_BITS (PAGE_CACHE_SIZE * 8)
+#define PAGE_CACHE_BITS (PAGE_SIZE * 8)
int hfsplus_block_allocate(struct super_block *sb, u32 size,
u32 offset, u32 *max)
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 63924662aaf3..ce014ceb89ef 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -24,16 +24,16 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
int l;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
- off &= ~PAGE_CACHE_MASK;
+ pagep = node->page + (off >> PAGE_SHIFT);
+ off &= ~PAGE_MASK;
- l = min_t(int, len, PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_SIZE - off);
memcpy(buf, kmap(*pagep) + off, l);
kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memcpy(buf, kmap(*++pagep), l);
kunmap(*pagep);
}
@@ -77,17 +77,17 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
int l;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
- off &= ~PAGE_CACHE_MASK;
+ pagep = node->page + (off >> PAGE_SHIFT);
+ off &= ~PAGE_MASK;
- l = min_t(int, len, PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_SIZE - off);
memcpy(kmap(*pagep) + off, buf, l);
set_page_dirty(*pagep);
kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memcpy(kmap(*++pagep), buf, l);
set_page_dirty(*pagep);
kunmap(*pagep);
@@ -107,16 +107,16 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
int l;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
- off &= ~PAGE_CACHE_MASK;
+ pagep = node->page + (off >> PAGE_SHIFT);
+ off &= ~PAGE_MASK;
- l = min_t(int, len, PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_SIZE - off);
memset(kmap(*pagep) + off, 0, l);
set_page_dirty(*pagep);
kunmap(*pagep);
while ((len -= l) != 0) {
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memset(kmap(*++pagep), 0, l);
set_page_dirty(*pagep);
kunmap(*pagep);
@@ -136,20 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
tree = src_node->tree;
src += src_node->page_offset;
dst += dst_node->page_offset;
- src_page = src_node->page + (src >> PAGE_CACHE_SHIFT);
- src &= ~PAGE_CACHE_MASK;
- dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT);
- dst &= ~PAGE_CACHE_MASK;
+ src_page = src_node->page + (src >> PAGE_SHIFT);
+ src &= ~PAGE_MASK;
+ dst_page = dst_node->page + (dst >> PAGE_SHIFT);
+ dst &= ~PAGE_MASK;
if (src == dst) {
- l = min_t(int, len, PAGE_CACHE_SIZE - src);
+ l = min_t(int, len, PAGE_SIZE - src);
memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l);
kunmap(*src_page);
set_page_dirty(*dst_page);
kunmap(*dst_page);
while ((len -= l) != 0) {
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memcpy(kmap(*++dst_page), kmap(*++src_page), l);
kunmap(*src_page);
set_page_dirty(*dst_page);
@@ -161,12 +161,12 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
do {
src_ptr = kmap(*src_page) + src;
dst_ptr = kmap(*dst_page) + dst;
- if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
- l = PAGE_CACHE_SIZE - src;
+ if (PAGE_SIZE - src < PAGE_SIZE - dst) {
+ l = PAGE_SIZE - src;
src = 0;
dst += l;
} else {
- l = PAGE_CACHE_SIZE - dst;
+ l = PAGE_SIZE - dst;
src += l;
dst = 0;
}
@@ -195,11 +195,11 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
dst += node->page_offset;
if (dst > src) {
src += len - 1;
- src_page = node->page + (src >> PAGE_CACHE_SHIFT);
- src = (src & ~PAGE_CACHE_MASK) + 1;
+ src_page = node->page + (src >> PAGE_SHIFT);
+ src = (src & ~PAGE_MASK) + 1;
dst += len - 1;
- dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
- dst = (dst & ~PAGE_CACHE_MASK) + 1;
+ dst_page = node->page + (dst >> PAGE_SHIFT);
+ dst = (dst & ~PAGE_MASK) + 1;
if (src == dst) {
while (src < len) {
@@ -208,7 +208,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
set_page_dirty(*dst_page);
kunmap(*dst_page);
len -= src;
- src = PAGE_CACHE_SIZE;
+ src = PAGE_SIZE;
src_page--;
dst_page--;
}
@@ -226,32 +226,32 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
dst_ptr = kmap(*dst_page) + dst;
if (src < dst) {
l = src;
- src = PAGE_CACHE_SIZE;
+ src = PAGE_SIZE;
dst -= l;
} else {
l = dst;
src -= l;
- dst = PAGE_CACHE_SIZE;
+ dst = PAGE_SIZE;
}
l = min(len, l);
memmove(dst_ptr - l, src_ptr - l, l);
kunmap(*src_page);
set_page_dirty(*dst_page);
kunmap(*dst_page);
- if (dst == PAGE_CACHE_SIZE)
+ if (dst == PAGE_SIZE)
dst_page--;
else
src_page--;
} while ((len -= l));
}
} else {
- src_page = node->page + (src >> PAGE_CACHE_SHIFT);
- src &= ~PAGE_CACHE_MASK;
- dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
- dst &= ~PAGE_CACHE_MASK;
+ src_page = node->page + (src >> PAGE_SHIFT);
+ src &= ~PAGE_MASK;
+ dst_page = node->page + (dst >> PAGE_SHIFT);
+ dst &= ~PAGE_MASK;
if (src == dst) {
- l = min_t(int, len, PAGE_CACHE_SIZE - src);
+ l = min_t(int, len, PAGE_SIZE - src);
memmove(kmap(*dst_page) + src,
kmap(*src_page) + src, l);
kunmap(*src_page);
@@ -259,7 +259,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
kunmap(*dst_page);
while ((len -= l) != 0) {
- l = min_t(int, len, PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_SIZE);
memmove(kmap(*++dst_page),
kmap(*++src_page), l);
kunmap(*src_page);
@@ -272,13 +272,13 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
do {
src_ptr = kmap(*src_page) + src;
dst_ptr = kmap(*dst_page) + dst;
- if (PAGE_CACHE_SIZE - src <
- PAGE_CACHE_SIZE - dst) {
- l = PAGE_CACHE_SIZE - src;
+ if (PAGE_SIZE - src <
+ PAGE_SIZE - dst) {
+ l = PAGE_SIZE - src;
src = 0;
dst += l;
} else {
- l = PAGE_CACHE_SIZE - dst;
+ l = PAGE_SIZE - dst;
src += l;
dst = 0;
}
@@ -444,14 +444,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
mapping = tree->inode->i_mapping;
off = (loff_t)cnid << tree->node_size_shift;
- block = off >> PAGE_CACHE_SHIFT;
- node->page_offset = off & ~PAGE_CACHE_MASK;
+ block = off >> PAGE_SHIFT;
+ node->page_offset = off & ~PAGE_MASK;
for (i = 0; i < tree->pages_per_bnode; block++, i++) {
page = read_mapping_page(mapping, block, NULL);
if (IS_ERR(page))
goto fail;
if (PageError(page)) {
- page_cache_release(page);
+ put_page(page);
goto fail;
}
node->page[i] = page;
@@ -569,7 +569,7 @@ void hfs_bnode_free(struct hfs_bnode *node)
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
- page_cache_release(node->page[i]);
+ put_page(node->page[i]);
kfree(node);
}
@@ -597,11 +597,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
pagep = node->page;
memset(kmap(*pagep) + node->page_offset, 0,
- min_t(int, PAGE_CACHE_SIZE, tree->node_size));
+ min_t(int, PAGE_SIZE, tree->node_size));
set_page_dirty(*pagep);
kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
- memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+ memset(kmap(*++pagep), 0, PAGE_SIZE);
set_page_dirty(*pagep);
kunmap(*pagep);
}
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 3345c7553edc..d9d1a36ba826 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -236,15 +236,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
tree->node_size_shift = ffs(size) - 1;
tree->pages_per_bnode =
- (tree->node_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ (tree->node_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
kunmap(page);
- page_cache_release(page);
+ put_page(page);
return tree;
fail_page:
- page_cache_release(page);
+ put_page(page);
free_inode:
tree->inode->i_mapping->a_ops = &hfsplus_aops;
iput(tree->inode);
@@ -380,9 +380,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
idx = 0;
for (;;) {
@@ -403,7 +403,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
}
- if (++off >= PAGE_CACHE_SIZE) {
+ if (++off >= PAGE_SIZE) {
kunmap(*pagep);
data = kmap(*++pagep);
off = 0;
@@ -426,9 +426,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
len = hfs_brec_lenoff(node, 0, &off16);
off = off16;
off += node->page_offset;
- pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+ pagep = node->page + (off >> PAGE_SHIFT);
data = kmap(*pagep);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
}
}
@@ -475,9 +475,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
len = hfs_brec_lenoff(node, 0, &off);
}
off += node->page_offset + nidx / 8;
- page = node->page[off >> PAGE_CACHE_SHIFT];
+ page = node->page[off >> PAGE_SHIFT];
data = kmap(page);
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
m = 1 << (~nidx & 7);
byte = data[off];
if (!(byte & m)) {
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d0f39dcbb58e..a4e867e08947 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
{
struct hfsplus_readdir_data *rd = file->private_data;
if (rd) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
list_del(&rd->list);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
kfree(rd);
}
return 0;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6dd107d7421e..b28f39865c3a 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -87,9 +87,9 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
}
if (!tree)
return 0;
- if (tree->node_size >= PAGE_CACHE_SIZE) {
+ if (tree->node_size >= PAGE_SIZE) {
nidx = page->index >>
- (tree->node_size_shift - PAGE_CACHE_SHIFT);
+ (tree->node_size_shift - PAGE_SHIFT);
spin_lock(&tree->hash_lock);
node = hfs_bnode_findhash(tree, nidx);
if (!node)
@@ -103,8 +103,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
spin_unlock(&tree->hash_lock);
} else {
nidx = page->index <<
- (PAGE_CACHE_SHIFT - tree->node_size_shift);
- i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+ (PAGE_SHIFT - tree->node_size_shift);
+ i = 1 << (PAGE_SHIFT - tree->node_size_shift);
spin_lock(&tree->hash_lock);
do {
node = hfs_bnode_findhash(tree, nidx++);
@@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
if (HFSPLUS_IS_RSRC(inode))
inode = HFSPLUS_I(inode)->rsrc_inode;
if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
hfsplus_file_truncate(inode);
if (inode->i_flags & S_DEAD) {
hfsplus_delete_cat(inode->i_ino,
HFSPLUS_SB(sb)->hidden_dir, NULL);
hfsplus_delete_inode(inode);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
@@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
error = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* Sync inode metadata into the catalog and extent trees.
@@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
@@ -403,6 +403,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
} else if (S_ISLNK(inode->i_mode)) {
sbi->file_count++;
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
hip->clump_blocks = 1;
} else
@@ -526,6 +527,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_mapping->a_ops = &hfsplus_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &hfsplus_aops;
} else {
init_special_inode(inode, inode->i_mode,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 0624ce4e0702..32a49e292b6a 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
goto out_drop_write;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
@@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
mark_inode_dirty(inode);
out_unlock_inode:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out_drop_write:
mnt_drop_write_file(file);
out:
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index df0c9af68d05..afb33eda6d7d 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -21,10 +21,10 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
@@ -66,7 +66,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
switch (type) {
case ACL_TYPE_ACCESS:
- xattr_name = POSIX_ACL_XATTR_ACCESS;
+ xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
err = posix_acl_equiv_mode(acl, &inode->i_mode);
if (err < 0)
@@ -76,7 +76,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
break;
case ACL_TYPE_DEFAULT:
- xattr_name = POSIX_ACL_XATTR_DEFAULT;
+ xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7302d96ae8bf..c35911362ff9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -438,7 +438,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = -EFBIG;
last_fs_block = sbi->total_blocks - 1;
last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
(last_fs_page > (pgoff_t)(~0ULL))) {
@@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void)
int err;
hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
- HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+ HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
hfsplus_init_once);
if (!hfsplus_inode_cachep)
return -ENOMEM;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 416b1dbafe51..70e445ff0cff 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -220,7 +220,7 @@ check_attr_tree_state_again:
index = 0;
written = 0;
- for (; written < node_size; index++, written += PAGE_CACHE_SIZE) {
+ for (; written < node_size; index++, written += PAGE_SIZE) {
void *kaddr;
page = read_mapping_page(mapping, index, NULL);
@@ -231,11 +231,11 @@ check_attr_tree_state_again:
kaddr = kmap_atomic(page);
memcpy(kaddr, buf + written,
- min_t(size_t, PAGE_CACHE_SIZE, node_size - written));
+ min_t(size_t, PAGE_SIZE, node_size - written));
kunmap_atomic(kaddr);
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
}
hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY);
@@ -431,9 +431,6 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
char *xattr_name;
int res;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
@@ -589,9 +586,6 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
int res;
char *xattr_name;
- if (!strcmp(name, ""))
- return -EINVAL;
-
xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
GFP_KERNEL);
if (!xattr_name)
@@ -849,12 +843,10 @@ end_removexattr:
return err;
}
-static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow retrieving properly prefixed attributes
* by prepending them with "osx."
@@ -871,12 +863,10 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
}
-static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
/*
* Don't allow setting properly prefixed attributes
* by prepending them with "osx."
@@ -893,19 +883,8 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
}
-static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_osx_handler = {
.prefix = XATTR_MAC_OSX_PREFIX,
- .list = hfsplus_osx_listxattr,
.get = hfsplus_osx_getxattr,
.set = hfsplus_osx_setxattr,
};
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index aacff00a9ff9..72a68a3a0c99 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -13,32 +13,24 @@
#include "xattr.h"
#include "acl.h"
-static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
static int hfsplus_initxattrs(struct inode *inode,
const struct xattr *xattr_array,
void *fs_info)
@@ -92,7 +84,6 @@ int hfsplus_init_inode_security(struct inode *inode,
const struct xattr_handler hfsplus_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = hfsplus_security_listxattr,
.get = hfsplus_security_getxattr,
.set = hfsplus_security_setxattr,
};
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index bcf65089b7f7..95a7704c7abb 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN);
}
-static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = hfsplus_trusted_listxattr,
.get = hfsplus_trusted_getxattr,
.set = hfsplus_trusted_setxattr,
};
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 5aa0e6dc4a1e..6fc269baf959 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = hfsplus_user_listxattr,
.get = hfsplus_user_getxattr,
.set = hfsplus_user_setxattr,
};
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2ac99db3750e..7016653f3e41 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+ hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = fsync_file(HOSTFS_I(inode)->fd, datasync);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -410,12 +410,12 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = mapping->host;
char *buffer;
loff_t base = page_offset(page);
- int count = PAGE_CACHE_SIZE;
- int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ int count = PAGE_SIZE;
+ int end_index = inode->i_size >> PAGE_SHIFT;
int err;
if (page->index >= end_index)
- count = inode->i_size & (PAGE_CACHE_SIZE-1);
+ count = inode->i_size & (PAGE_SIZE-1);
buffer = kmap(page);
@@ -447,7 +447,7 @@ static int hostfs_readpage(struct file *file, struct page *page)
buffer = kmap(page);
bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
if (bytes_read < 0) {
ClearPageUptodate(page);
SetPageError(page);
@@ -455,7 +455,7 @@ static int hostfs_readpage(struct file *file, struct page *page)
goto out;
}
- memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read);
+ memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read);
ClearPageError(page);
SetPageUptodate(page);
@@ -471,7 +471,7 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
*pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
@@ -485,14 +485,14 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
void *buffer;
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
int err;
buffer = kmap(page);
err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
kunmap(page);
- if (!PageUptodate(page) && err == PAGE_CACHE_SIZE)
+ if (!PageUptodate(page) && err == PAGE_SIZE)
SetPageUptodate(page);
/*
@@ -502,7 +502,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
if (err > 0 && (pos > inode->i_size))
inode->i_size = pos;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -730,15 +730,13 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
init_special_inode(inode, mode, dev);
err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
- if (!err)
+ if (err)
goto out_free;
err = read_name(inode, name);
__putname(name);
if (err)
goto out_put;
- if (err)
- goto out_put;
d_instantiate(dentry, inode);
return 0;
@@ -892,9 +890,14 @@ static const struct inode_operations hostfs_dir_iops = {
.setattr = hostfs_setattr,
};
-static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *hostfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- char *link = __getname();
+ char *link;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ link = kmalloc(PATH_MAX, GFP_KERNEL);
if (link) {
char *path = dentry_name(dentry);
int err = -ENOMEM;
@@ -905,25 +908,20 @@ static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
__putname(path);
}
if (err < 0) {
- __putname(link);
+ kfree(link);
return ERR_PTR(err);
}
} else {
return ERR_PTR(-ENOMEM);
}
- return *cookie = link;
-}
-
-static void hostfs_put_link(struct inode *unused, void *cookie)
-{
- __putname(cookie);
+ set_delayed_call(done, kfree_link, link);
+ return link;
}
static const struct inode_operations hostfs_link_iops = {
.readlink = generic_readlink,
- .follow_link = hostfs_follow_link,
- .put_link = hostfs_put_link,
+ .get_link = hostfs_get_link,
};
static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index dc540bfcee1d..e57a53c13d86 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
if (whence == SEEK_DATA || whence == SEEK_HOLE)
return -EINVAL;
- mutex_lock(&i->i_mutex);
+ inode_lock(i);
hpfs_lock(s);
/*pr_info("dir lseek\n");*/
@@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
ok:
filp->f_pos = new_off;
hpfs_unlock(s);
- mutex_unlock(&i->i_mutex);
+ inode_unlock(i);
return new_off;
fail:
/*pr_warn("illegal lseek: %016llx\n", new_off);*/
hpfs_unlock(s);
- mutex_unlock(&i->i_mutex);
+ inode_unlock(i);
return -ESPIPE;
}
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 933c73780813..1f3c6d76200b 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -77,6 +77,7 @@ void hpfs_read_inode(struct inode *i)
kfree(ea);
i->i_mode = S_IFLNK | 0777;
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &hpfs_symlink_aops;
set_nlink(i, 1);
i->i_size = ea_size;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index a69bbc1e87f8..a136929189f0 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -133,7 +133,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock)
{
struct quad_buffer_head qbh;
- u32 *directory;
+ __le32 *directory;
u32 n_hotfixes, n_used_hotfixes;
unsigned i;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 9e92c9c2d319..bb8d67e2740a 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -227,8 +227,6 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
int err;
if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
- if (!new_valid_dev(rdev))
- return -EINVAL;
hpfs_lock(dir->i_sb);
err = -ENOSPC;
fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -334,6 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
result->i_blocks = 1;
set_nlink(result, 1);
result->i_size = strlen(symlink);
+ inode_nohighmem(result);
result->i_op = &page_symlink_inode_operations;
result->i_data.a_ops = &hpfs_symlink_aops;
@@ -377,12 +376,11 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
dnode_secno dno;
int r;
- int rep = 0;
int err;
hpfs_lock(dir->i_sb);
hpfs_adjust_length(name, &len);
-again:
+
err = -ENOENT;
de = map_dirent(dir, hpfs_i(dir)->i_dno, name, len, &dno, &qbh);
if (!de)
@@ -402,33 +400,9 @@ again:
hpfs_error(dir->i_sb, "there was error when removing dirent");
err = -EFSERROR;
break;
- case 2: /* no space for deleting, try to truncate file */
-
+ case 2: /* no space for deleting */
err = -ENOSPC;
- if (rep++)
- break;
-
- dentry_unhash(dentry);
- if (!d_unhashed(dentry)) {
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
- }
- if (generic_permission(inode, MAY_WRITE) ||
- !S_ISREG(inode->i_mode) ||
- get_write_access(inode)) {
- d_rehash(dentry);
- } else {
- struct iattr newattrs;
- /*pr_info("truncating file before delete.\n");*/
- newattrs.ia_size = 0;
- newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
- err = notify_change(dentry, &newattrs, NULL);
- put_write_access(inode);
- if (!err)
- goto again;
- }
- hpfs_unlock(dir->i_sb);
- return -ENOSPC;
+ break;
default:
drop_nlink(inode);
err = 0;
@@ -502,7 +476,7 @@ out:
static int hpfs_symlink_readpage(struct file *file, struct page *page)
{
- char *link = kmap(page);
+ char *link = page_address(page);
struct inode *i = page->mapping->host;
struct fnode *fnode;
struct buffer_head *bh;
@@ -518,14 +492,12 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
goto fail;
hpfs_unlock(i->i_sb);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
fail:
hpfs_unlock(i->i_sb);
SetPageError(page);
- kunmap(page);
unlock_page(page);
return err;
}
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a561591896bd..458cf463047b 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -261,7 +261,7 @@ static int init_inodecache(void)
hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
sizeof(struct hpfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (hpfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 316adb968b65..4ea71eba40a5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -4,11 +4,11 @@
* Nadia Yvette Chambers, 2002
*
* Copyright (C) 2002 Linus Torvalds.
+ * License: GPL
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
@@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
file_accessed(file);
ret = -ENOMEM;
@@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
inode->i_size = len;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -213,12 +213,12 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
int i, chunksize;
/* Find which 4k chunk and offset with in that chunk */
- i = offset >> PAGE_CACHE_SHIFT;
- offset = offset & ~PAGE_CACHE_MASK;
+ i = offset >> PAGE_SHIFT;
+ offset = offset & ~PAGE_MASK;
while (size) {
size_t n;
- chunksize = PAGE_CACHE_SIZE;
+ chunksize = PAGE_SIZE;
if (offset)
chunksize -= offset;
if (chunksize > size)
@@ -237,7 +237,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
/*
* Support for read() - Find the page attached to f_mapping and copy out the
* data. Its *very* similar to do_generic_mapping_read(), we can't use that
- * since it has PAGE_CACHE_SIZE assumptions.
+ * since it has PAGE_SIZE assumptions.
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -285,7 +285,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
* We have the page, copy it to user space buffer.
*/
copied = hugetlbfs_read_actor(page, offset, to, nr);
- page_cache_release(page);
+ put_page(page);
}
offset += copied;
retval += copied;
@@ -324,20 +324,62 @@ static void remove_huge_page(struct page *page)
delete_from_page_cache(page);
}
+/*
+ * Unmap huge pages in the file page-offset range [start, end) from every
+ * vma in @root that overlaps that range.
+ *
+ * @end is exclusive; end == 0 indicates that the entire range after
+ * @start should be unmapped.
+ */
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * The interval tree walk takes an inclusive last index, while the
+	 * callers pass an exclusive @end, hence "end - 1".  Walking up to
+	 * @end itself would also visit a vma that begins exactly at @end and
+	 * hand unmap_hugepage_range() an empty (start == end) range for it.
+	 */
+	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
+		unsigned long v_offset;
+		unsigned long v_end;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
+		 */
+		if (vma->vm_pgoff < start)
+			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		/* Never unmap past the end of this vma. */
+		if (!end)
+			v_end = vma->vm_end;
+		else {
+			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+							+ vma->vm_start;
+			if (v_end > vma->vm_end)
+				v_end = vma->vm_end;
+		}
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+									NULL);
+	}
+}
/*
* remove_inode_hugepages handles two distinct cases: truncation and hole
* punch. There are subtle differences in operation for each case.
-
+ *
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- * maps and global counts.
+ * maps and global counts. Page faults can not race with truncation
+ * in this routine. hugetlb_no_page() prevents page faults in the
+ * truncated range. It checks i_size before allocation, and again after
+ * with the page table lock for the page held. The same lock must be
+ * acquired to unmap a page.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserv map
* deleted. The region/reserv map for ranges without associated
- * pages are not modified.
+ * pages are not modified. Page faults can race with hole punch.
+ * This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
*/
@@ -361,77 +403,81 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
next = start;
while (next < end) {
/*
- * Make sure to never grab more pages that we
- * might possibly need.
+ * Don't grab more pages than the number left in the range.
*/
if (end - next < lookup_nr)
lookup_nr = end - next;
/*
- * This pagevec_lookup() may return pages past 'end',
- * so we must check for page->index > end.
+ * When no more pages are found, we are done.
*/
- if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
- if (next == start)
- break;
- next = start;
- continue;
- }
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
+ break;
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ bool rsv_on_error;
u32 hash;
+ /*
+ * The page (index) could be beyond end. This is
+ * only possible in the punch hole case as end is
+ * max page offset in the truncate case.
+ */
+ next = page->index;
+ if (next >= end)
+ break;
+
hash = hugetlb_fault_mutex_hash(h, current->mm,
&pseudo_vma,
mapping, next, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
- lock_page(page);
- if (page->index >= end) {
- unlock_page(page);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- next = end; /* we are done */
- break;
- }
-
/*
* If page is mapped, it was faulted in after being
- * unmapped. Do nothing in this race case. In the
- * normal case page is not mapped.
+ * unmapped in caller. Unmap (again) now after taking
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the page.
+ *
+ * This race can only happen in the hole punch case.
+ * Getting here in a truncate operation is a bug.
*/
- if (!page_mapped(page)) {
- bool rsv_on_error = !PagePrivate(page);
- /*
- * We must free the huge page and remove
- * from page cache (remove_huge_page) BEFORE
- * removing the region/reserve map
- * (hugetlb_unreserve_pages). In rare out
- * of memory conditions, removal of the
- * region/reserve map could fail. Before
- * free'ing the page, note PagePrivate which
- * is used in case of error.
- */
- remove_huge_page(page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(
- inode, next,
- next + 1, 1)))
- hugetlb_fix_reserve_counts(
- inode, rsv_on_error);
- }
+ if (unlikely(page_mapped(page))) {
+ BUG_ON(truncate_op);
+
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ next * pages_per_huge_page(h),
+ (next + 1) * pages_per_huge_page(h));
+ i_mmap_unlock_write(mapping);
}
- if (page->index > next)
- next = page->index;
+ lock_page(page);
+ /*
+ * We must free the huge page and remove from page
+ * cache (remove_huge_page) BEFORE removing the
+ * region/reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the
+ * region/reserve map could fail. Before free'ing
+ * the page, note PagePrivate which is used in case
+ * of error.
+ */
+ rsv_on_error = !PagePrivate(page);
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode,
+ next, next + 1, 1)))
+ hugetlb_fix_reserve_counts(inode,
+ rsv_on_error);
+ }
- ++next;
unlock_page(page);
-
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
+ ++next;
huge_pagevec_release(&pvec);
+ cond_resched();
}
if (truncate_op)
@@ -450,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
- struct vm_area_struct *vma;
-
- /*
- * end == 0 indicates that the entire range after
- * start should be unmapped.
- */
- vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
- unsigned long v_offset;
-
- /*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
- */
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (end) {
- end = ((end - start) << PAGE_SHIFT) +
- vma->vm_start + v_offset;
- if (end > vma->vm_end)
- end = vma->vm_end;
- } else
- end = vma->vm_end;
-
- unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
- }
-}
-
static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
@@ -519,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (hole_end > hole_start) {
struct address_space *mapping = inode->i_mapping;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
hugetlb_vmdelete_list(&mapping->i_mmap,
@@ -527,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
hole_end >> PAGE_SHIFT);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, hole_start, hole_end);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
@@ -561,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
start = offset >> hpage_shift;
end = (offset + len + hpage_size - 1) >> hpage_shift;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
error = inode_newsize_ok(inode, offset + len);
@@ -647,11 +658,8 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
inode->i_ctime = CURRENT_TIME;
- spin_lock(&inode->i_lock);
- inode->i_private = NULL;
- spin_unlock(&inode->i_lock);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
@@ -709,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
/*
* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
* be taken from reclaim -- unlike regular filesystems. This needs an
- * annotation because huge_pmd_share() does an allocation under
+ * annotation because huge_pmd_share() does an allocation under hugetlb's
* i_mmap_rwsem.
*/
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
@@ -739,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* The policy is initialized here even if we are creating a
* private inode because initialization simply creates an
- * an empty rb tree and calls spin_lock_init(), later when we
+ * empty rb tree and calls rwlock_init(), later when we
* call mpol_free_shared_policy() it will just return because
* the rb tree will still be empty.
*/
@@ -761,6 +769,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
lockdep_annotate_inode_mutex_key(inode);
@@ -1202,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = {
.mount = hugetlbfs_mount,
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("hugetlbfs");
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1322,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void)
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
- 0, 0, init_once);
+ 0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
goto out2;
@@ -1356,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void)
out2:
return error;
}
-
-static void __exit exit_hugetlbfs_fs(void)
-{
- struct hstate *h;
- int i;
-
-
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(hugetlbfs_inode_cachep);
- i = 0;
- for_each_hstate(h)
- kern_unmount(hugetlbfs_vfsmount[i++]);
- unregister_filesystem(&hugetlbfs_fs_type);
-}
-
-module_init(init_hugetlbfs_fs)
-module_exit(exit_hugetlbfs_fs)
-
-MODULE_LICENSE("GPL");
+fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index 78a17b8859e1..69b8b526c194 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_rdev = 0;
inode->dirtied_when = 0;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ inode->i_wb_frn_winner = 0;
+ inode->i_wb_frn_avg_time = 0;
+ inode->i_wb_frn_history = 0;
+#endif
+
if (security_inode_alloc(inode))
goto out;
spin_lock_init(&inode->i_lock);
@@ -225,7 +231,7 @@ void __destroy_inode(struct inode *inode)
inode_detach_wb(inode);
security_inode_free(inode);
fsnotify_inode_delete(inode);
- locks_free_lock_context(inode->i_flctx);
+ locks_free_lock_context(inode);
if (!inode->i_nlink) {
WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -495,7 +501,7 @@ void clear_inode(struct inode *inode)
*/
spin_lock_irq(&inode->i_data.tree_lock);
BUG_ON(inode->i_data.nrpages);
- BUG_ON(inode->i_data.nrshadows);
+ BUG_ON(inode->i_data.nrexceptional);
spin_unlock_irq(&inode->i_data.tree_lock);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
@@ -966,9 +972,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
swap(inode1, inode2);
if (inode1 && !S_ISDIR(inode1->i_mode))
- mutex_lock(&inode1->i_mutex);
+ inode_lock(inode1);
if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
+ inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);
@@ -980,9 +986,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
if (inode1 && !S_ISDIR(inode1->i_mode))
- mutex_unlock(&inode1->i_mutex);
+ inode_unlock(inode1);
if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
- mutex_unlock(&inode2->i_mutex);
+ inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);
@@ -1597,6 +1603,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
/**
* touch_atime - update the access time
* @path: the &struct path to update
+ * @inode: inode to update
*
* Update the accessed time on an inode and mark it for writeback.
* This function automatically handles read only file systems and media,
@@ -1882,7 +1889,7 @@ void __init inode_init(void)
sizeof(struct inode),
0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
/* Hash may have been set up in inode_init_early */
@@ -2027,3 +2034,9 @@ void inode_set_flags(struct inode *inode, unsigned int flags,
new_flags) != old_flags));
}
EXPORT_SYMBOL(inode_set_flags);
+
+/*
+ * inode_nohighmem - allocate an inode's page cache with GFP_USER
+ * @inode: inode whose mapping is adjusted
+ *
+ * Sets the gfp mask of @inode's mapping to GFP_USER.  As the name says, the
+ * point is to keep that mapping's pages out of highmem.
+ */
+void inode_nohighmem(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+
+	mapping_set_gfp_mask(mapping, GFP_USER);
+}
+EXPORT_SYMBOL(inode_nohighmem);
diff --git a/fs/internal.h b/fs/internal.h
index 71859c4d0b41..b71deeecea17 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
/*
* namespace.c
*/
-extern int copy_mount_options(const void __user *, unsigned long *);
+extern void *copy_mount_options(const void __user *);
extern char *copy_mount_string(const void __user *);
extern struct vfsmount *lookup_mnt(struct path *);
@@ -151,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m);
* fs/nsfs.c
*/
extern struct dentry_operations ns_dentry_operations;
+
+/*
+ * fs/ioctl.c
+ */
+extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
+ unsigned long arg);
+extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5d01d2638ca5..116a333e9c77 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
+#include "internal.h"
#include <asm/ioctls.h>
@@ -32,8 +33,7 @@
*
* Returns 0 on success, -errno on error.
*/
-static long vfs_ioctl(struct file *filp, unsigned int cmd,
- unsigned long arg)
+long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -215,6 +215,29 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
return error;
}
+/*
+ * Clone @olen bytes starting at @off of the file open on descriptor @srcfd
+ * into @dst_file at @destoff.
+ *
+ * Returns -EBADF if @srcfd does not name an open file, otherwise whatever
+ * vfs_clone_file_range() returns.
+ */
+static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+			     u64 off, u64 olen, u64 destoff)
+{
+	struct fd src = fdget(srcfd);
+	int err = -EBADF;
+
+	if (src.file) {
+		err = vfs_clone_file_range(src.file, off, dst_file, destoff,
+					   olen);
+		/* Drop the reference taken by fdget(). */
+		fdput(src);
+	}
+	return err;
+}
+
+/*
+ * FICLONERANGE: copy the struct file_clone_range request in from user space
+ * and pass its fields to ioctl_file_clone().
+ *
+ * Returns -EFAULT if the request cannot be read from @argp.
+ */
+static long ioctl_file_clone_range(struct file *file, void __user *argp)
+{
+	struct file_clone_range req;
+	unsigned long not_copied;
+
+	not_copied = copy_from_user(&req, argp, sizeof(req));
+	if (not_copied != 0)
+		return -EFAULT;
+
+	return ioctl_file_clone(file, req.src_fd, req.src_offset,
+				req.src_length, req.dest_offset);
+}
+
#ifdef CONFIG_BLOCK
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -411,9 +434,9 @@ int generic_block_fiemap(struct inode *inode,
u64 len, get_block_t *get_block)
{
int ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
EXPORT_SYMBOL(generic_block_fiemap);
@@ -545,6 +568,41 @@ static int ioctl_fsthaw(struct file *filp)
return thaw_super(sb);
}
+/*
+ * FIDEDUPERANGE: copy a struct file_dedupe_range together with its trailing
+ * info[] array in from user space, run vfs_dedupe_file_range() on it and
+ * copy the structure back out to user space.
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed, or the
+ * error reported by memdup_user() or vfs_dedupe_file_range().
+ */
+static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
+{
+	struct file_dedupe_range __user *argp = arg;
+	struct file_dedupe_range *same = NULL;
+	int ret;
+	unsigned long size;
+	u16 count;
+
+	if (get_user(count, &argp->dest_count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Fixed header plus @count trailing info[] entries. */
+	size = offsetof(struct file_dedupe_range __user, info[count]);
+
+	same = memdup_user(argp, size);
+	if (IS_ERR(same)) {
+		ret = PTR_ERR(same);
+		same = NULL;	/* keep the kfree() below a no-op */
+		goto out;
+	}
+
+	/*
+	 * memdup_user() read dest_count from user space a second time, and
+	 * user space may have changed it since the buffer was sized.  Force
+	 * it back to the value the allocation was based on so that nothing
+	 * downstream walks info[] past the end of the buffer.
+	 */
+	same->dest_count = count;
+
+	ret = vfs_dedupe_file_range(file, same);
+	if (ret)
+		goto out;
+
+	/* copy_to_user() returns the number of bytes it could not copy. */
+	ret = copy_to_user(argp, same, size);
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	kfree(same);
+	return ret;
+}
+
/*
* When you add any new common ioctls to the switches above and below
* please update compat_sys_ioctl() too.
@@ -600,6 +658,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
case FIGETBSZ:
return put_user(inode->i_sb->s_blocksize, argp);
+ case FICLONE:
+ return ioctl_file_clone(filp, arg, 0, 0, 0);
+
+ case FICLONERANGE:
+ return ioctl_file_clone_range(filp, argp);
+
+ case FIDEDUPERANGE:
+ return ioctl_file_dedupe_range(filp, argp);
+
default:
if (S_ISREG(inode->i_mode))
error = file_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index f311bf084015..2e4e834d1a98 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -26,7 +26,7 @@
#include "zisofs.h"
/* This should probably be global. */
-static char zisofs_sink_page[PAGE_CACHE_SIZE];
+static char zisofs_sink_page[PAGE_SIZE];
/*
* This contains the zlib memory allocation and the mutex for the
@@ -70,11 +70,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
for ( i = 0 ; i < pcount ; i++ ) {
if (!pages[i])
continue;
- memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+ memset(page_address(pages[i]), 0, PAGE_SIZE);
flush_dcache_page(pages[i]);
SetPageUptodate(pages[i]);
}
- return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
+ return ((loff_t)pcount) << PAGE_SHIFT;
}
/* Because zlib is not thread-safe, do all the I/O at the top. */
@@ -121,11 +121,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
if (pages[curpage]) {
stream.next_out = page_address(pages[curpage])
+ poffset;
- stream.avail_out = PAGE_CACHE_SIZE - poffset;
+ stream.avail_out = PAGE_SIZE - poffset;
poffset = 0;
} else {
stream.next_out = (void *)&zisofs_sink_page;
- stream.avail_out = PAGE_CACHE_SIZE;
+ stream.avail_out = PAGE_SIZE;
}
}
if (!stream.avail_in) {
@@ -220,14 +220,14 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
* pages with the data we have anyway...
*/
start_off = page_offset(pages[full_page]);
- end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+ end_off = min_t(loff_t, start_off + PAGE_SIZE, inode->i_size);
cstart_block = start_off >> zisofs_block_shift;
cend_block = (end_off + (1 << zisofs_block_shift) - 1)
>> zisofs_block_shift;
- WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
- ((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+ WARN_ON(start_off - (full_page << PAGE_SHIFT) !=
+ ((cstart_block << zisofs_block_shift) & PAGE_MASK));
/* Find the pointer to this specific chunk */
/* Note: we're not using isonum_731() here because the data is known aligned */
@@ -260,10 +260,10 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
ret = zisofs_uncompress_block(inode, block_start, block_end,
pcount, pages, poffset, &err);
poffset += ret;
- pages += poffset >> PAGE_CACHE_SHIFT;
- pcount -= poffset >> PAGE_CACHE_SHIFT;
- full_page -= poffset >> PAGE_CACHE_SHIFT;
- poffset &= ~PAGE_CACHE_MASK;
+ pages += poffset >> PAGE_SHIFT;
+ pcount -= poffset >> PAGE_SHIFT;
+ full_page -= poffset >> PAGE_SHIFT;
+ poffset &= ~PAGE_MASK;
if (err) {
brelse(bh);
@@ -282,7 +282,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
if (poffset && *pages) {
memset(page_address(*pages) + poffset, 0,
- PAGE_CACHE_SIZE - poffset);
+ PAGE_SIZE - poffset);
flush_dcache_page(*pages);
SetPageUptodate(*pages);
}
@@ -302,12 +302,12 @@ static int zisofs_readpage(struct file *file, struct page *page)
int i, pcount, full_page;
unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
unsigned int zisofs_pages_per_cblock =
- PAGE_CACHE_SHIFT <= zisofs_block_shift ?
- (1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+ PAGE_SHIFT <= zisofs_block_shift ?
+ (1 << (zisofs_block_shift - PAGE_SHIFT)) : 0;
struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
pgoff_t index = page->index, end_index;
- end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
/*
* If this page is wholly outside i_size we just return zero;
* do_generic_file_read() will handle this for us
@@ -318,7 +318,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
return 0;
}
- if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+ if (PAGE_SHIFT <= zisofs_block_shift) {
/* We have already been given one page, this is the one
we must do. */
full_page = index & (zisofs_pages_per_cblock - 1);
@@ -351,7 +351,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
kunmap(pages[i]);
unlock_page(pages[i]);
if (i != full_page)
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d67a16f2a45d..131dedc920d8 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -94,7 +94,7 @@ static int __init init_inodecache(void)
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (isofs_inode_cachep == NULL)
return -ENOMEM;
@@ -1021,7 +1021,7 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock,
* the page with useless information without generating any
* I/O errors.
*/
- if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
+ if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
__func__, b_off,
(unsigned long long)inode->i_size);
@@ -1417,6 +1417,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_fop = &isofs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &isofs_symlink_aops;
} else
/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 735d7522a3a9..5384ceb35b1c 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -687,7 +687,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct iso_inode_info *ei = ISOFS_I(inode);
struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
- char *link = kmap(page);
+ char *link = page_address(page);
unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
struct buffer_head *bh;
char *rpnt = link;
@@ -774,7 +774,6 @@ repeat:
brelse(bh);
*rpnt = '\0';
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
@@ -791,7 +790,6 @@ fail:
brelse(bh);
error:
SetPageError(page);
- kunmap(page);
unlock_page(page);
return -EIO;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 36345fefa3ff..2ad98d6e19f4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -81,11 +81,11 @@ static void release_buffer_page(struct buffer_head *bh)
if (!trylock_page(page))
goto nope;
- page_cache_get(page);
+ get_page(page);
__brelse(bh);
try_to_free_buffers(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return;
nope:
@@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal,
if (is_journal_aborted(journal))
return 0;
- bh = jbd2_journal_get_descriptor_buffer(journal);
+ bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
+ JBD2_COMMIT_BLOCK);
if (!bh)
return 1;
tmp = (struct commit_header *)bh->b_data;
- tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
- tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
@@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal,
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
mapping = jinode->i_vfs_inode->i_mapping;
- set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
/*
* submit the inode data buffers. We use writepage
@@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal,
ret = err;
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
- clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_atomic();
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ smp_mb();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
spin_unlock(&journal->j_list_lock);
@@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
/* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
- set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
+ jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
if (err) {
@@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
ret = err;
}
spin_lock(&journal->j_list_lock);
- clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_atomic();
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ smp_mb();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
@@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
-static void jbd2_descr_block_csum_set(journal_t *j,
- struct buffer_head *bh)
-{
- struct jbd2_journal_block_tail *tail;
- __u32 csum;
-
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
- sizeof(struct jbd2_journal_block_tail));
- tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
- tail->t_checksum = cpu_to_be32(csum);
-}
-
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
@@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
ktime_t start_time;
u64 commit_time;
char *tagp = NULL;
- journal_header_t *header;
journal_block_tag_t *tag = NULL;
int space_left = 0;
int first_tag = 0;
@@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd2_journal_abort(journal, err);
blk_start_plug(&plug);
- jbd2_journal_write_revoke_records(journal, commit_transaction,
- &log_bufs, WRITE_SYNC);
+ jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
jbd_debug(3, "JBD2: commit phase 2b\n");
@@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(4, "JBD2: get descriptor\n");
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
+ descriptor = jbd2_journal_get_descriptor_buffer(
+ commit_transaction,
+ JBD2_DESCRIPTOR_BLOCK);
if (!descriptor) {
jbd2_journal_abort(journal, -EIO);
continue;
@@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
(unsigned long long)descriptor->b_blocknr,
descriptor->b_data);
- header = (journal_header_t *)descriptor->b_data;
- header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
- header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-
tagp = &descriptor->b_data[sizeof(journal_header_t)];
space_left = descriptor->b_size -
sizeof(journal_header_t);
@@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
- jbd2_descr_block_csum_set(journal, descriptor);
+ jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 81e622681c82..435f0b26ac20 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
-struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *
+jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
{
+ journal_t *journal = transaction->t_journal;
struct buffer_head *bh;
unsigned long long blocknr;
+ journal_header_t *header;
int err;
err = jbd2_journal_next_log_block(journal, &blocknr);
@@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
return NULL;
lock_buffer(bh);
memset(bh->b_data, 0, journal->j_blocksize);
+ header = (journal_header_t *)bh->b_data;
+ header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+ header->h_blocktype = cpu_to_be32(type);
+ header->h_sequence = cpu_to_be32(transaction->t_tid);
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
return bh;
}
+/*
+ * Compute the checksum of a journal descriptor block and store it in the
+ * jbd2_journal_block_tail that occupies the last bytes of the block.
+ * Nothing is done unless the journal uses v2 or v3 checksums.
+ */
+void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
+{
+	struct jbd2_journal_block_tail *tail;
+	__u32 sum;
+
+	if (jbd2_journal_has_csum_v2or3(j)) {
+		tail = (struct jbd2_journal_block_tail *)
+			(bh->b_data + j->j_blocksize - sizeof(*tail));
+
+		/* The checksum covers the whole block with this field zeroed. */
+		tail->t_checksum = 0;
+		sum = jbd2_chksum(j, j->j_csum_seed, bh->b_data,
+				  j->j_blocksize);
+		tail->t_checksum = cpu_to_be32(sum);
+	}
+}
+
/*
* Return tid of the oldest transaction in the journal and block in the journal
* where the transaction starts.
@@ -1408,11 +1430,12 @@ out:
/**
* jbd2_mark_journal_empty() - Mark on disk journal as empty.
* @journal: The journal to update.
+ * @write_op: With which operation should we write the journal sb
*
* Update a journal's dynamic superblock fields to show that journal is empty.
* Write updated superblock to disk waiting for IO to complete.
*/
-static void jbd2_mark_journal_empty(journal_t *journal)
+static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
{
journal_superblock_t *sb = journal->j_superblock;
@@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
sb->s_start = cpu_to_be32(0);
read_unlock(&journal->j_state_lock);
- jbd2_write_superblock(journal, WRITE_FUA);
+ jbd2_write_superblock(journal, write_op);
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
@@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal)
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal);
+
+ write_lock(&journal->j_state_lock);
+ journal->j_tail_sequence =
+ ++journal->j_transaction_sequence;
+ write_unlock(&journal->j_state_lock);
+
+ jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
} else
err = -EIO;
@@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
- jbd2_mark_journal_empty(journal);
+ jbd2_mark_journal_empty(journal, WRITE_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
J_ASSERT(!journal->j_running_transaction);
@@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
if (write) {
/* Lock to make assertions happy... */
mutex_lock(&journal->j_checkpoint_mutex);
- jbd2_mark_journal_empty(journal);
+ jbd2_mark_journal_empty(journal, WRITE_FUA);
mutex_unlock(&journal->j_checkpoint_mutex);
}
@@ -2192,7 +2221,7 @@ void jbd2_journal_ack_err(journal_t *journal)
int jbd2_journal_blocks_per_page(struct inode *inode)
{
- return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
}
/*
@@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
restart:
spin_lock(&journal->j_list_lock);
/* Is commit writing out inode - we have to wait */
- if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 76579c28edc7..805bc6bcd8ab 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return 0;
}
-static int jbd2_descr_block_csum_verify(journal_t *j,
- void *buf)
+static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
{
struct jbd2_journal_block_tail *tail;
__be32 provided;
@@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal,
descr_csum_size =
sizeof(struct jbd2_journal_block_tail);
if (descr_csum_size > 0 &&
- !jbd2_descr_block_csum_verify(journal,
- bh->b_data)) {
+ !jbd2_descriptor_block_csum_verify(journal,
+ bh->b_data)) {
printk(KERN_ERR "JBD2: Invalid checksum "
"recovering block %lu in log\n",
next_log_block);
@@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal,
return err;
}
-static int jbd2_revoke_block_csum_verify(journal_t *j,
- void *buf)
-{
- struct jbd2_journal_revoke_tail *tail;
- __be32 provided;
- __u32 calculated;
-
- if (!jbd2_journal_has_csum_v2or3(j))
- return 1;
-
- tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
- sizeof(struct jbd2_journal_revoke_tail));
- provided = tail->r_checksum;
- tail->r_checksum = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
- tail->r_checksum = provided;
-
- return provided == cpu_to_be32(calculated);
-}
-
/* Scan a revoke record, marking all blocks mentioned as revoked. */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
@@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
offset = sizeof(jbd2_journal_revoke_header_t);
rcount = be32_to_cpu(header->r_count);
- if (!jbd2_revoke_block_csum_verify(journal, header))
+ if (!jbd2_descriptor_block_csum_verify(journal, header))
return -EFSBADCRC;
if (jbd2_journal_has_csum_v2or3(journal))
- csum_size = sizeof(struct jbd2_journal_revoke_tail);
+ csum_size = sizeof(struct jbd2_journal_block_tail);
if (rcount > journal->j_blocksize - csum_size)
return -EINVAL;
max = rcount;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 705ae577882b..91171dc352cb 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,11 +122,11 @@ struct jbd2_revoke_table_s
#ifdef __KERNEL__
-static void write_one_revoke_record(journal_t *, transaction_t *,
+static void write_one_revoke_record(transaction_t *,
struct list_head *,
struct buffer_head **, int *,
- struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
+ struct jbd2_revoke_record_s *);
+static void flush_descriptor(journal_t *, struct buffer_head *, int);
#endif
/* Utility functions to maintain the revoke table */
@@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
*/
-void jbd2_journal_write_revoke_records(journal_t *journal,
- transaction_t *transaction,
- struct list_head *log_bufs,
- int write_op)
+void jbd2_journal_write_revoke_records(transaction_t *transaction,
+ struct list_head *log_bufs)
{
+ journal_t *journal = transaction->t_journal;
struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
@@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
- write_one_revoke_record(journal, transaction, log_bufs,
- &descriptor, &offset,
- record, write_op);
+ write_one_revoke_record(transaction, log_bufs,
+ &descriptor, &offset, record);
count++;
list_del(&record->hash);
kmem_cache_free(jbd2_revoke_record_cache, record);
}
}
if (descriptor)
- flush_descriptor(journal, descriptor, offset, write_op);
+ flush_descriptor(journal, descriptor, offset);
jbd_debug(1, "Wrote %d revoke records\n", count);
}
@@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
* block if the old one is full or if we have not already created one.
*/
-static void write_one_revoke_record(journal_t *journal,
- transaction_t *transaction,
+static void write_one_revoke_record(transaction_t *transaction,
struct list_head *log_bufs,
struct buffer_head **descriptorp,
int *offsetp,
- struct jbd2_revoke_record_s *record,
- int write_op)
+ struct jbd2_revoke_record_s *record)
{
+ journal_t *journal = transaction->t_journal;
int csum_size = 0;
struct buffer_head *descriptor;
int sz, offset;
- journal_header_t *header;
/* If we are already aborting, this all becomes a noop. We
still need to go round the loop in
@@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal,
/* Do we need to leave space at the end for a checksum? */
if (jbd2_journal_has_csum_v2or3(journal))
- csum_size = sizeof(struct jbd2_journal_revoke_tail);
+ csum_size = sizeof(struct jbd2_journal_block_tail);
if (jbd2_has_feature_64bit(journal))
sz = 8;
@@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal,
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
if (offset + sz > journal->j_blocksize - csum_size) {
- flush_descriptor(journal, descriptor, offset, write_op);
+ flush_descriptor(journal, descriptor, offset);
descriptor = NULL;
}
}
if (!descriptor) {
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
+ descriptor = jbd2_journal_get_descriptor_buffer(transaction,
+ JBD2_REVOKE_BLOCK);
if (!descriptor)
return;
- header = (journal_header_t *)descriptor->b_data;
- header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
- header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
- header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
BUFFER_TRACE(descriptor, "file in log_bufs");
@@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
-static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
-{
- struct jbd2_journal_revoke_tail *tail;
- __u32 csum;
-
- if (!jbd2_journal_has_csum_v2or3(j))
- return;
-
- tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
- sizeof(struct jbd2_journal_revoke_tail));
- tail->r_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
- tail->r_checksum = cpu_to_be32(csum);
-}
-
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
@@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
static void flush_descriptor(journal_t *journal,
struct buffer_head *descriptor,
- int offset, int write_op)
+ int offset)
{
jbd2_journal_revoke_header_t *header;
@@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal,
header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
- jbd2_revoke_csum_set(journal, descriptor);
+ jbd2_descriptor_block_csum_set(journal, descriptor);
set_buffer_jwrite(descriptor);
BUFFER_TRACE(descriptor, "write");
set_buffer_dirty(descriptor);
- write_dirty_buffer(descriptor, write_op);
+ write_dirty_buffer(descriptor, WRITE_SYNC);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 98d04c5fe3d2..2c56c3e32194 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -764,13 +764,11 @@ void jbd2_journal_unlock_updates (journal_t *journal)
static void warn_dirty_buffer(struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
-
printk(KERN_WARNING
- "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+ "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
"There's a risk of filesystem corruption in case of system "
"crash.\n",
- bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
+ bh->b_bdev, (unsigned long long)bh->b_blocknr);
}
/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
@@ -968,14 +966,8 @@ repeat:
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
- frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
- if (!frozen_buffer) {
- printk(KERN_ERR "%s: OOM for frozen_buffer\n",
- __func__);
- JBUFFER_TRACE(jh, "oom!");
- error = -ENOMEM;
- goto out;
- }
+ frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
+ GFP_NOFS | __GFP_NOFAIL);
goto repeat;
}
jh->b_frozen_data = frozen_buffer;
@@ -1009,7 +1001,8 @@ out:
}
/* Fast check whether buffer is already attached to the required transaction */
-static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
+ bool undo)
{
struct journal_head *jh;
bool ret = false;
@@ -1036,6 +1029,9 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh)
jh = READ_ONCE(bh->b_private);
if (!jh)
goto out;
+ /* For undo access buffer must have data copied */
+ if (undo && !jh->b_committed_data)
+ goto out;
if (jh->b_transaction != handle->h_transaction &&
jh->b_next_transaction != handle->h_transaction)
goto out;
@@ -1073,7 +1069,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int rc;
- if (jbd2_write_access_granted(handle, bh))
+ if (jbd2_write_access_granted(handle, bh, false))
return 0;
jh = jbd2_journal_add_journal_head(bh);
@@ -1210,7 +1206,7 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
char *committed_data = NULL;
JBUFFER_TRACE(jh, "entry");
- if (jbd2_write_access_granted(handle, bh))
+ if (jbd2_write_access_granted(handle, bh, true))
return 0;
jh = jbd2_journal_add_journal_head(bh);
@@ -1224,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
goto out;
repeat:
- if (!jh->b_committed_data) {
- committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
- if (!committed_data) {
- printk(KERN_ERR "%s: No memory for committed data\n",
- __func__);
- err = -ENOMEM;
- goto out;
- }
- }
+ if (!jh->b_committed_data)
+ committed_data = jbd2_alloc(jh2bh(jh)->b_size,
+ GFP_NOFS|__GFP_NOFAIL);
jbd_lock_bh_state(bh);
if (!jh->b_committed_data) {
@@ -1937,8 +1927,8 @@ out:
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS are set, we wait for commit
+ * code to release the buffers.
*
*
* For all the buffers on this page,
@@ -2152,6 +2142,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
if (!buffer_dirty(bh)) {
/* bdflush has written it. We can drop it now */
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
@@ -2181,6 +2172,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
clear_buffer_jbddirty(bh);
+ __jbd2_journal_remove_checkpoint(jh);
goto zap_buffer;
}
}
@@ -2271,7 +2263,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
struct buffer_head *head, *bh, *next;
unsigned int stop = offset + length;
unsigned int curr_off = 0;
- int partial_page = (offset || length < PAGE_CACHE_SIZE);
+ int partial_page = (offset || length < PAGE_SIZE);
int may_free = 1;
int ret = 0;
@@ -2280,7 +2272,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return 0;
- BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+ BUG_ON(stop > PAGE_SIZE || stop < length);
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index 3ea36554107f..8918ac905a3b 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -2,10 +2,6 @@
JFFS2 LOCKING DOCUMENTATION
---------------------------
-At least theoretically, JFFS2 does not require the Big Kernel Lock
-(BKL), which was always helpfully obtained for it by Linux 2.4 VFS
-code. It has its own locking, as described below.
-
This document attempts to describe the existing locking rules for
JFFS2. It is not expected to remain perfectly up to date, but ought to
be fairly close.
@@ -69,6 +65,7 @@ Ordering constraints:
any f->sem held.
2. Never attempt to lock two file mutexes in one thread.
No ordering rules have been made for doing so.
+ 3. Never lock a page cache page with f->sem held.
erase_completion_lock spinlock
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index bb9cebc9ca8a..e5c1783ab64a 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c)
siginitset(&hupmask, sigmask(SIGHUP));
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
- allow_signal(SIGCONT);
allow_signal(SIGHUP);
c->gc_task = current;
@@ -121,20 +120,18 @@ static int jffs2_garbage_collect_thread(void *_c)
/* Put_super will send a SIGKILL and then wait on the sem.
*/
while (signal_pending(current) || freezing(current)) {
- siginfo_t info;
unsigned long signr;
if (try_to_freeze())
goto again;
- signr = dequeue_signal_lock(current, &current->blocked, &info);
+ signr = kernel_dequeue_signal(NULL);
switch(signr) {
case SIGSTOP:
jffs2_dbg(1, "%s(): SIGSTOP received\n",
__func__);
- set_current_state(TASK_STOPPED);
- schedule();
+ kernel_signal_stop();
break;
case SIGKILL:
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a3750f902adc..b288c8ae1236 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mtd/mtd.h>
+#include <linux/mm.h> /* kvfree() */
#include "nodelist.h"
static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
@@ -49,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c)
static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
- struct jffs2_inode_cache *ic)
+ struct jffs2_inode_cache *ic,
+ int *dir_hardlinks)
{
struct jffs2_full_dirent *fd;
@@ -68,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n",
fd->name, fd->ino, ic->ino);
jffs2_mark_node_obsolete(c, fd->raw);
+ /* Clear the ic/raw union so it doesn't cause problems later. */
+ fd->ic = NULL;
continue;
}
+ /* From this point, fd->raw is no longer used so we can set fd->ic */
+ fd->ic = child_ic;
+ child_ic->pino_nlink++;
+ /* If we appear (at this stage) to have hard-linked directories,
+ * set a flag to trigger a scan later */
if (fd->type == DT_DIR) {
- if (child_ic->pino_nlink) {
- JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n",
- fd->name, fd->ino, ic->ino);
- /* TODO: What do we do about it? */
- } else {
- child_ic->pino_nlink = ic->ino;
- }
- } else
- child_ic->pino_nlink++;
+ child_ic->flags |= INO_FLAGS_IS_DIR;
+ if (child_ic->pino_nlink > 1)
+ *dir_hardlinks = 1;
+ }
dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino);
/* Can't free scan_dents so far. We might need them in pass 2 */
@@ -94,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c,
*/
static int jffs2_build_filesystem(struct jffs2_sb_info *c)
{
- int ret;
- int i;
+ int ret, i, dir_hardlinks = 0;
struct jffs2_inode_cache *ic;
struct jffs2_full_dirent *fd;
struct jffs2_full_dirent *dead_fds = NULL;
@@ -119,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
/* Now scan the directory tree, increasing nlink according to every dirent found. */
for_each_inode(i, c, ic) {
if (ic->scan_dents) {
- jffs2_build_inode_pass1(c, ic);
+ jffs2_build_inode_pass1(c, ic, &dir_hardlinks);
cond_resched();
}
}
@@ -155,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
}
dbg_fsbuild("pass 2a complete\n");
+
+ if (dir_hardlinks) {
+ /* If we detected directory hardlinks earlier, *hopefully*
+ * they are gone now because some of the links were from
+ * dead directories which still had some old dirents lying
+ * around and not yet garbage-collected, but which have
+ * been discarded above. So clear the pino_nlink field
+ * in each directory, so that the final scan below can
+ * print appropriate warnings. */
+ for_each_inode(i, c, ic) {
+ if (ic->flags & INO_FLAGS_IS_DIR)
+ ic->pino_nlink = 0;
+ }
+ }
dbg_fsbuild("freeing temporary data structures\n");
/* Finally, we can scan again and free the dirent structs */
@@ -162,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
while(ic->scan_dents) {
fd = ic->scan_dents;
ic->scan_dents = fd->next;
+ /* We do use the pino_nlink field to count nlink of
+ * directories during fs build, so set it to the
+ * parent ino# now. Now that there's hopefully only
+ * one. */
+ if (fd->type == DT_DIR) {
+ if (!fd->ic) {
+ /* We'll have complained about it and marked the corresponding
+ raw node obsolete already. Just skip it. */
+ continue;
+ }
+
+ /* We *have* to have set this in jffs2_build_inode_pass1() */
+ BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR));
+
+ /* We clear ic->pino_nlink for all directories' ic *only* if dir_hardlinks
+ * is set. Otherwise, we know this should never trigger anyway, so
+ * we don't do the check. And ic->pino_nlink still contains the nlink
+ * value (which is 1). */
+ if (dir_hardlinks && fd->ic->pino_nlink) {
+ JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n",
+ fd->name, fd->ino, ic->ino, fd->ic->pino_nlink);
+ /* Should we unlink it from its previous parent? */
+ }
+
+ /* For directories, ic->pino_nlink holds that parent inode # */
+ fd->ic->pino_nlink = ic->ino;
+ }
jffs2_free_full_dirent(fd);
}
ic->scan_dents = NULL;
@@ -240,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c,
/* Reduce nlink of the child. If it's now zero, stick it on the
dead_fds list to be cleaned up later. Else just free the fd */
-
- if (fd->type == DT_DIR)
- child_ic->pino_nlink = 0;
- else
- child_ic->pino_nlink--;
+ child_ic->pino_nlink--;
if (!child_ic->pino_nlink) {
dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n",
@@ -383,12 +423,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
return 0;
out_free:
-#ifndef __ECOS
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
-#endif
- kfree(c->blocks);
+ kvfree(c->blocks);
return ret;
}
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 1090eb64b90d..9d26b1b9fc01 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -95,15 +95,15 @@ __jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f)
rather than mucking around with actually reading the node
and checking the compression type, which is the real way
to tell a hole node. */
- if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag)
- && frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) {
+ if (frag->ofs & (PAGE_SIZE-1) && frag_prev(frag)
+ && frag_prev(frag)->size < PAGE_SIZE && frag_prev(frag)->node) {
JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n",
ref_offset(fn->raw));
bitched = 1;
}
- if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag)
- && frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) {
+ if ((frag->ofs+frag->size) & (PAGE_SIZE-1) && frag_next(frag)
+ && frag_next(frag)->size < PAGE_SIZE && frag_next(frag)->node) {
JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n",
ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size);
bitched = 1;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index d211b8e18566..30c4c9ebb693 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
__func__, ret);
- /* Might as well let the VFS know */
- d_instantiate(new_dentry, d_inode(old_dentry));
- ihold(d_inode(old_dentry));
+ /*
+ * We can't keep the target in dcache after that.
+ * For one thing, we can't afford dentry aliases for directories.
+ * For another, if there was a victim, we _can't_ set new inode
+ * for that sucker and we have to trigger mount eviction - the
+ * caller won't do it on its own since we are returning an error.
+ */
+ d_invalidate(new_dentry);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index f509f62e12f6..0e62dec3effc 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Trigger GC to flush any pending writes for this inode */
jffs2_flush_wbuf_gc(c, inode->i_ino);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
@@ -87,14 +87,15 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
int ret;
jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n",
- __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT);
+ __func__, inode->i_ino, pg->index << PAGE_SHIFT);
BUG_ON(!PageLocked(pg));
pg_buf = kmap(pg);
/* FIXME: Can kmap fail? */
- ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE);
+ ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT,
+ PAGE_SIZE);
if (ret) {
ClearPageUptodate(pg);
@@ -137,39 +138,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
- struct jffs2_raw_inode ri;
- uint32_t alloc_len = 0;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- uint32_t pageofs = index << PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
+ uint32_t pageofs = index << PAGE_SHIFT;
int ret = 0;
- jffs2_dbg(1, "%s()\n", __func__);
-
- if (pageofs > inode->i_size) {
- ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
- ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
- if (ret)
- return ret;
- }
-
- mutex_lock(&f->sem);
pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg) {
- if (alloc_len)
- jffs2_complete_reservation(c);
- mutex_unlock(&f->sem);
+ if (!pg)
return -ENOMEM;
- }
*pagep = pg;
- if (alloc_len) {
+ jffs2_dbg(1, "%s()\n", __func__);
+
+ if (pageofs > inode->i_size) {
/* Make new hole frag from old EOF to new page */
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+ struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
+ uint32_t alloc_len;
jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
(unsigned int)inode->i_size, pageofs);
+ ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+ ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+ if (ret)
+ goto out_page;
+
+ mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -196,6 +191,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
if (IS_ERR(fn)) {
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -210,10 +206,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
goto out_page;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
+ mutex_unlock(&f->sem);
}
/*
@@ -222,18 +220,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* case of a short-copy.
*/
if (!PageUptodate(pg)) {
+ mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
+ mutex_unlock(&f->sem);
if (ret)
goto out_page;
}
- mutex_unlock(&f->sem);
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
return ret;
out_page:
unlock_page(pg);
- page_cache_release(pg);
- mutex_unlock(&f->sem);
+ put_page(pg);
return ret;
}
@@ -248,14 +246,14 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
struct jffs2_raw_inode *ri;
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + copied;
unsigned aligned_start = start & ~3;
int ret = 0;
uint32_t writtenlen = 0;
jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
- __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT,
+ __func__, inode->i_ino, pg->index << PAGE_SHIFT,
start, end, pg->flags);
/* We need to avoid deadlock with page_cache_read() in
@@ -264,7 +262,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
to re-lock it. */
BUG_ON(!PageUptodate(pg));
- if (end == PAGE_CACHE_SIZE) {
+ if (end == PAGE_SIZE) {
/* When writing out the end of a page, write out the
_whole_ page. This helps to reduce the number of
nodes in files which have many short writes, like
@@ -278,7 +276,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
jffs2_dbg(1, "%s(): Allocation of raw inode failed\n",
__func__);
unlock_page(pg);
- page_cache_release(pg);
+ put_page(pg);
return -ENOMEM;
}
@@ -295,7 +293,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
kmap(pg);
ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start,
- (pg->index << PAGE_CACHE_SHIFT) + aligned_start,
+ (pg->index << PAGE_SHIFT) + aligned_start,
end - aligned_start, &writtenlen);
kunmap(pg);
@@ -332,6 +330,6 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
jffs2_dbg(1, "%s() returning %d\n",
__func__, writtenlen > 0 ? writtenlen : ret);
unlock_page(pg);
- page_cache_release(pg);
+ put_page(pg);
return writtenlen > 0 ? writtenlen : ret;
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2caf1682036d..ae2ebb26b446 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -586,8 +586,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
goto out_root;
sb->s_maxbytes = 0xFFFFFFFF;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = JFFS2_SUPER_MAGIC;
if (!(sb->s_flags & MS_RDONLY))
jffs2_start_garbage_collect_thread(c);
@@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
out_root:
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
- kfree(c->blocks);
+ kvfree(c->blocks);
out_inohash:
jffs2_clear_xattr_subsystem(c);
kfree(c->inocache_list);
@@ -688,7 +685,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
struct inode *inode = OFNI_EDONI_2SFFJ(f);
struct page *pg;
- pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+ pg = read_cache_page(inode->i_mapping, offset >> PAGE_SHIFT,
(void *)jffs2_do_readpage_unlock, inode);
if (IS_ERR(pg))
return (void *)pg;
@@ -704,7 +701,7 @@ void jffs2_gc_release_page(struct jffs2_sb_info *c,
struct page *pg = (void *)*priv;
kunmap(pg);
- page_cache_release(pg);
+ put_page(pg);
}
static int jffs2_flash_setup(struct jffs2_sb_info *c) {
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 5a2dec2b064c..9ed0f26cf023 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -134,37 +134,59 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
if (mutex_lock_interruptible(&c->alloc_sem))
return -EINTR;
+
for (;;) {
+ /* We can't start doing GC until we've finished checking
+ the node CRCs etc. */
+ int bucket, want_ino;
+
spin_lock(&c->erase_completion_lock);
if (!c->unchecked_size)
break;
-
- /* We can't start doing GC yet. We haven't finished checking
- the node CRCs etc. Do it now. */
-
- /* checked_ino is protected by the alloc_sem */
- if (c->checked_ino > c->highest_ino && xattr) {
- pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
- c->unchecked_size);
- jffs2_dbg_dump_block_lists_nolock(c);
- spin_unlock(&c->erase_completion_lock);
- mutex_unlock(&c->alloc_sem);
- return -ENOSPC;
- }
-
spin_unlock(&c->erase_completion_lock);
if (!xattr)
xattr = jffs2_verify_xattr(c);
spin_lock(&c->inocache_lock);
+ /* Instead of doing the inodes in numeric order, doing a lookup
+ * in the hash for each possible number, just walk the hash
+ * buckets of *existing* inodes. This means that we process
+ * them out-of-order, but it can be a lot faster if there's
+ * a sparse inode# space. Which there often is. */
+ want_ino = c->check_ino;
+ for (bucket = c->check_ino % c->inocache_hashsize ; bucket < c->inocache_hashsize; bucket++) {
+ for (ic = c->inocache_list[bucket]; ic; ic = ic->next) {
+ if (ic->ino < want_ino)
+ continue;
+
+ if (ic->state != INO_STATE_CHECKEDABSENT &&
+ ic->state != INO_STATE_PRESENT)
+ goto got_next; /* with inocache_lock held */
+
+ jffs2_dbg(1, "Skipping ino #%u already checked\n",
+ ic->ino);
+ }
+ want_ino = 0;
+ }
- ic = jffs2_get_ino_cache(c, c->checked_ino++);
+ /* Point c->check_ino past the end of the last bucket. */
+ c->check_ino = ((c->highest_ino + c->inocache_hashsize + 1) &
+ ~c->inocache_hashsize) - 1;
- if (!ic) {
- spin_unlock(&c->inocache_lock);
- continue;
- }
+ spin_unlock(&c->inocache_lock);
+
+ pr_crit("Checked all inodes but still 0x%x bytes of unchecked space?\n",
+ c->unchecked_size);
+ jffs2_dbg_dump_block_lists_nolock(c);
+ mutex_unlock(&c->alloc_sem);
+ return -ENOSPC;
+
+ got_next:
+ /* For next time round the loop, we want c->check_ino to indicate
+ * the *next* one we want to check. And since we're walking the
+ * buckets rather than doing it sequentially, it's: */
+ c->check_ino = ic->ino + c->inocache_hashsize;
if (!ic->pino_nlink) {
jffs2_dbg(1, "Skipping check of ino #%d with nlink/pino zero\n",
@@ -176,8 +198,6 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
switch(ic->state) {
case INO_STATE_CHECKEDABSENT:
case INO_STATE_PRESENT:
- jffs2_dbg(1, "Skipping ino #%u already checked\n",
- ic->ino);
spin_unlock(&c->inocache_lock);
continue;
@@ -196,7 +216,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
ic->ino);
/* We need to come back again for the _same_ inode. We've
made no progress in this case, but that should be OK */
- c->checked_ino--;
+ c->check_ino = ic->ino;
mutex_unlock(&c->alloc_sem);
sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
@@ -532,7 +552,7 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era
goto upnout;
}
/* We found a datanode. Do the GC */
- if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) {
+ if((start >> PAGE_SHIFT) < ((end-1) >> PAGE_SHIFT)) {
/* It crosses a page boundary. Therefore, it must be a hole. */
ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end);
} else {
@@ -1172,8 +1192,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
struct jffs2_node_frag *frag;
uint32_t min, max;
- min = start & ~(PAGE_CACHE_SIZE-1);
- max = min + PAGE_CACHE_SIZE;
+ min = start & ~(PAGE_SIZE-1);
+ max = min + PAGE_SIZE;
frag = jffs2_lookup_node_frag(&f->fragtree, start);
@@ -1296,14 +1316,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
BUG_ON(start > orig_start);
}
- /* First, use readpage() to read the appropriate page into the page cache */
- /* Q: What happens if we actually try to GC the _same_ page for which commit_write()
- * triggered garbage collection in the first place?
- * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the
- * page OK. We'll actually write it out again in commit_write, which is a little
- * suboptimal, but at least we're correct.
- */
+ /* The rules state that we must obtain the page lock *before* f->sem, so
+ * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's
+ * actually going to *change* so we're safe; we only allow reading.
+ *
+ * It is important to note that jffs2_write_begin() will ensure that its
+ * page is marked Uptodate before allocating space. That means that if we
+ * end up here trying to GC the *same* page that jffs2_write_begin() is
+ * trying to write out, read_cache_page() will not deadlock. */
+ mutex_unlock(&f->sem);
pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg);
+ mutex_lock(&f->sem);
if (IS_ERR(pg_ptr)) {
pr_warn("read_cache_page() returned error: %ld\n",
@@ -1328,7 +1351,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset);
datalen = end - offset;
- writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1));
+ writebuf = pg_ptr + (offset & (PAGE_SIZE -1));
comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen);
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 046fee8b6e9b..778275f48a87 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -49,7 +49,7 @@ struct jffs2_sb_info {
struct mtd_info *mtd;
uint32_t highest_ino;
- uint32_t checked_ino;
+ uint32_t check_ino; /* *NEXT* inode to be checked */
unsigned int flags;
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 9a5449bc3afb..b86c78d178c6 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -90,7 +90,7 @@ uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list,
/* If the last fragment starts at the RAM page boundary, it is
* REF_PRISTINE irrespective of its size. */
- if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) {
+ if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) {
dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n",
frag->ofs, frag->ofs + frag->size);
frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE;
@@ -237,7 +237,7 @@ static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *r
If so, both 'this' and the new node get marked REF_NORMAL so
the GC can take a look.
*/
- if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) {
+ if (lastend && (lastend-1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) {
if (this->node)
mark_ref_normal(this->node->raw);
mark_ref_normal(newfrag->node->raw);
@@ -382,7 +382,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
/* If we now share a page with other nodes, mark either previous
or next node REF_NORMAL, as appropriate. */
- if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) {
+ if (newfrag->ofs & (PAGE_SIZE-1)) {
struct jffs2_node_frag *prev = frag_prev(newfrag);
mark_ref_normal(fn->raw);
@@ -391,7 +391,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
mark_ref_normal(prev->node->raw);
}
- if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) {
+ if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE-1)) {
struct jffs2_node_frag *next = frag_next(newfrag);
if (next) {
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index fa35ff79ab35..0637271f3770 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -194,6 +194,7 @@ struct jffs2_inode_cache {
#define INO_STATE_CLEARING 6 /* In clear_inode() */
#define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */
+#define INO_FLAGS_IS_DIR 0x02 /* is a directory */
#define RAWNODE_CLASS_INODE_CACHE 0
#define RAWNODE_CLASS_XATTR_DATUM 1
@@ -249,7 +250,10 @@ struct jffs2_readinode_info
struct jffs2_full_dirent
{
- struct jffs2_raw_node_ref *raw;
+ union {
+ struct jffs2_raw_node_ref *raw;
+ struct jffs2_inode_cache *ic; /* Just during part of build */
+ };
struct jffs2_full_dirent *next;
uint32_t version;
uint32_t ino; /* == zero for unlink */
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index b6bd4affd9ad..cda0774c2c9c 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -846,8 +846,8 @@ int jffs2_thread_should_wake(struct jffs2_sb_info *c)
return 1;
if (c->unchecked_size) {
- jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, checked_ino #%d\n",
- c->unchecked_size, c->checked_ino);
+ jffs2_dbg(1, "jffs2_thread_should_wake(): unchecked_size %d, check_ino #%d\n",
+ c->unchecked_size, c->check_ino);
return 1;
}
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index d4b43fb7adb1..7a28facd7175 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -48,42 +48,24 @@ int jffs2_init_security(struct inode *inode, struct inode *dir,
}
/* ---- XATTR Handler for "security.*" ----------------- */
-static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size);
}
-static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
-
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
name, buffer, size, flags);
}
-static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_SECURITY_PREFIX);
- strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = jffs2_security_listxattr,
.set = jffs2_security_setxattr,
.get = jffs2_security_getxattr
};
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d86c5e3176a1..0a9a114bb9d1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
- if (jffs2_blocks_use_vmalloc(c))
- vfree(c->blocks);
- else
- kfree(c->blocks);
+ kvfree(c->blocks);
jffs2_flash_cleanup(c);
kfree(c->inocache_list);
jffs2_clear_xattr_subsystem(c);
@@ -387,7 +384,7 @@ static int __init init_jffs2_fs(void)
jffs2_inode_cachep = kmem_cache_create("jffs2_i",
sizeof(struct jffs2_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
jffs2_i_init_once);
if (!jffs2_inode_cachep) {
pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 8ce2f240125b..2cabd649d4fb 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -14,7 +14,7 @@
const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 63f31c0733c5..b25d28a21212 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
{
struct delayed_work *dwork;
- dwork = container_of(work, struct delayed_work, work);
+ dwork = to_delayed_work(work);
return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
}
@@ -1183,22 +1183,20 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
{
- struct nand_ecclayout *oinfo = c->mtd->ecclayout;
-
if (!c->mtd->oobsize)
return 0;
/* Cleanmarker is out-of-band, so inline size zero */
c->cleanmarker_size = 0;
- if (!oinfo || oinfo->oobavail == 0) {
+ if (c->mtd->oobavail == 0) {
pr_err("inconsistent device description\n");
return -EINVAL;
}
jffs2_dbg(1, "using OOB on NAND\n");
- c->oobavail = oinfo->oobavail;
+ c->oobavail = c->mtd->oobavail;
/* Initialise write buffer */
init_rwsem(&c->wbuf_sem);
@@ -1264,7 +1262,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
if ((c->flash_size % c->sector_size) != 0) {
c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
- };
+ }
c->wbuf_ofs = 0xFFFFFFFF;
c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index b634de4c8101..7fb187ab2682 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -172,8 +172,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
beginning of a page and runs to the end of the file, or if
it's a hole node, mark it REF_PRISTINE, else REF_NORMAL.
*/
- if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) ||
- ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) &&
+ if ((je32_to_cpu(ri->dsize) >= PAGE_SIZE) ||
+ ( ((je32_to_cpu(ri->offset)&(PAGE_SIZE-1))==0) &&
(je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) == je32_to_cpu(ri->isize)))) {
flash_ofs |= REF_PRISTINE;
} else {
@@ -366,7 +366,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
break;
}
mutex_lock(&f->sem);
- datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1)));
+ datalen = min_t(uint32_t, writelen,
+ PAGE_SIZE - (offset & (PAGE_SIZE-1)));
cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen);
comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index f092fee5be50..da3e18503c65 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -967,7 +967,8 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct jffs2_xattr_ref *ref, **pref;
struct jffs2_xattr_datum *xd;
const struct xattr_handler *xhandle;
- ssize_t len, rc;
+ const char *prefix;
+ ssize_t prefix_len, len, rc;
int retry = 0;
rc = check_xattr_ref_inode(c, ic);
@@ -998,17 +999,23 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
}
xhandle = xprefix_to_handler(xd->xprefix);
- if (!xhandle)
+ if (!xhandle || (xhandle->list && !xhandle->list(dentry)))
continue;
+ prefix = xhandle->prefix ?: xhandle->name;
+ prefix_len = strlen(prefix);
+ rc = prefix_len + xd->name_len + 1;
+
if (buffer) {
- rc = xhandle->list(dentry, buffer+len, size-len,
- xd->xname, xd->name_len, xd->flags);
- } else {
- rc = xhandle->list(dentry, NULL, 0, xd->xname,
- xd->name_len, xd->flags);
+ if (rc > size - len) {
+ rc = -ERANGE;
+ goto out;
+ }
+ memcpy(buffer, prefix, prefix_len);
+ buffer += prefix_len;
+ memcpy(buffer, xd->xname, xd->name_len);
+ buffer += xd->name_len;
+ *buffer++ = 0;
}
- if (rc < 0)
- goto out;
len += rc;
}
rc = len;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ceaf9c693225..b2555ef07a12 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,35 +16,25 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size);
}
-static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
name, buffer, size, flags);
}
-static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static bool jffs2_trusted_listxattr(struct dentry *dentry)
{
- size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
-
- if (list && retlen<=list_size) {
- strcpy(list, XATTR_TRUSTED_PREFIX);
- strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
- }
-
- return retlen;
+ return capable(CAP_SYS_ADMIN);
}
const struct xattr_handler jffs2_trusted_xattr_handler = {
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index a71391eba514..539bd630b5e4 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,40 +16,24 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size);
}
-static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
- if (!strcmp(name, ""))
- return -EINVAL;
return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
name, buffer, size, flags);
}
-static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
-
- if (list && retlen <= list_size) {
- strcpy(list, XATTR_USER_PREFIX);
- strcpy(list + XATTR_USER_PREFIX_LEN, name);
- }
-
- return retlen;
-}
-
const struct xattr_handler jffs2_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = jffs2_user_listxattr,
.set = jffs2_user_setxattr,
.get = jffs2_user_getxattr
};
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 0c8ca830b113..49456853e9de 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -40,10 +40,10 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
switch(type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return ERR_PTR(-EINVAL);
@@ -82,7 +82,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
switch (type) {
case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
+ ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
rc = posix_acl_equiv_mode(acl, &inode->i_mode);
if (rc < 0)
@@ -94,7 +94,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
}
break;
case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
+ ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
return -EINVAL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 0e026a7bdcd4..4ce7735dd042 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (rc)
return rc;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (!(inode->i_state & I_DIRTY_ALL) ||
(datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc;
}
rc |= jfs_commit_inode(inode, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return rc ? -EIO : 0;
}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 41aa3ca6a6a4..9d9bae63ae2a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -60,6 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
} else if (S_ISLNK(inode->i_mode)) {
if (inode->i_size >= IDATASIZE) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &jfs_aops;
} else {
inode->i_op = &jfs_fast_symlink_inode_operations;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 8db8b7d61e40..8653cac7e12e 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
/* Lock against other parallel changes of flags */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
jfs_get_inode_flags(jfs_inode);
oldflags = jfs_inode->mode2;
@@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
((flags ^ oldflags) &
(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
err = -EPERM;
goto setflags_out;
}
@@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
jfs_inode->mode2 = flags;
jfs_set_inode_flags(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
setflags_out:
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a69bdf2a1085..a270cb7ff4e0 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1835,17 +1835,16 @@ static int lbmLogInit(struct jfs_log * log)
for (i = 0; i < LOGPAGES;) {
char *buffer;
uint offset;
- struct page *page;
+ struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
- if (buffer == NULL)
+ if (!page)
goto error;
- page = virt_to_page(buffer);
+ buffer = page_address(page);
for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) {
lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
if (lbuf == NULL) {
if (offset == 0)
- free_page((unsigned long) buffer);
+ __free_page(page);
goto error;
}
if (offset) /* we already have one reference */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index a3eb316b1ac3..b60e015cc757 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -80,7 +80,7 @@ static inline void lock_metapage(struct metapage *mp)
static struct kmem_cache *metapage_cache;
static mempool_t *metapage_mempool;
-#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE)
+#define MPS_PER_PAGE (PAGE_SIZE >> L2PSIZE)
#if MPS_PER_PAGE > 1
@@ -316,7 +316,7 @@ static void last_write_complete(struct page *page)
struct metapage *mp;
unsigned int offset;
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (mp && test_bit(META_io, &mp->flag)) {
if (mp->lsn)
@@ -366,12 +366,12 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
int bad_blocks = 0;
page_start = (sector_t)page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
set_page_writeback(page);
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (!mp || !test_bit(META_dirty, &mp->flag))
@@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
bio = NULL;
} else
inc_io(page);
- xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
+ xlen = (PAGE_SIZE - offset) >> inode->i_blkbits;
pblock = metapage_get_blocks(inode, lblock, &xlen);
if (!pblock) {
printk(KERN_ERR "JFS: metapage_get_blocks failed\n");
@@ -485,7 +485,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
struct inode *inode = page->mapping->host;
struct bio *bio = NULL;
int block_offset;
- int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+ int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
sector_t page_start; /* address of page in fs blocks */
sector_t pblock;
int xlen;
@@ -494,7 +494,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
BUG_ON(!PageLocked(page));
page_start = (sector_t)page->index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
block_offset = 0;
while (block_offset < blocks_per_page) {
@@ -542,7 +542,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
int ret = 1;
int offset;
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (!mp)
@@ -568,7 +568,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
static void metapage_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- BUG_ON(offset || length < PAGE_CACHE_SIZE);
+ BUG_ON(offset || length < PAGE_SIZE);
BUG_ON(PageWriteback(page));
@@ -599,10 +599,10 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
inode->i_ino, lblock, absolute);
l2bsize = inode->i_blkbits;
- l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+ l2BlocksPerPage = PAGE_SHIFT - l2bsize;
page_index = lblock >> l2BlocksPerPage;
page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize;
- if ((page_offset + size) > PAGE_CACHE_SIZE) {
+ if ((page_offset + size) > PAGE_SIZE) {
jfs_err("MetaData crosses page boundary!!");
jfs_err("lblock = %lx, size = %d", lblock, size);
dump_stack();
@@ -621,7 +621,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
mapping = inode->i_mapping;
}
- if (new && (PSIZE == PAGE_CACHE_SIZE)) {
+ if (new && (PSIZE == PAGE_SIZE)) {
page = grab_cache_page(mapping, page_index);
if (!page) {
jfs_err("grab_cache_page failed!");
@@ -693,7 +693,7 @@ unlock:
void grab_metapage(struct metapage * mp)
{
jfs_info("grab_metapage: mp = 0x%p", mp);
- page_cache_get(mp->page);
+ get_page(mp->page);
lock_page(mp->page);
mp->count++;
lock_metapage(mp);
@@ -706,12 +706,12 @@ void force_metapage(struct metapage *mp)
jfs_info("force_metapage: mp = 0x%p", mp);
set_bit(META_forcewrite, &mp->flag);
clear_bit(META_sync, &mp->flag);
- page_cache_get(page);
+ get_page(page);
lock_page(page);
set_page_dirty(page);
write_one_page(page, 1);
clear_bit(META_forcewrite, &mp->flag);
- page_cache_release(page);
+ put_page(page);
}
void hold_metapage(struct metapage *mp)
@@ -726,7 +726,7 @@ void put_metapage(struct metapage *mp)
unlock_page(mp->page);
return;
}
- page_cache_get(mp->page);
+ get_page(mp->page);
mp->count++;
lock_metapage(mp);
unlock_page(mp->page);
@@ -746,7 +746,7 @@ void release_metapage(struct metapage * mp)
assert(mp->count);
if (--mp->count || mp->nohomeok) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return;
}
@@ -764,13 +764,13 @@ void release_metapage(struct metapage * mp)
drop_metapage(page, mp);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
void __invalidate_metapages(struct inode *ip, s64 addr, int len)
{
sector_t lblock;
- int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits;
+ int l2BlocksPerPage = PAGE_SHIFT - ip->i_blkbits;
int BlocksPerPage = 1 << l2BlocksPerPage;
/* All callers are interested in block device's mapping */
struct address_space *mapping =
@@ -788,7 +788,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
page = find_lock_page(mapping, lblock >> l2BlocksPerPage);
if (!page)
continue;
- for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+ for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
mp = page_to_mp(page, offset);
if (!mp)
continue;
@@ -803,7 +803,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
remove_from_logsync(mp);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index 337e9e51ac06..a869fb4a20d6 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -106,7 +106,7 @@ static inline void metapage_nohomeok(struct metapage *mp)
lock_page(page);
if (!mp->nohomeok++) {
mark_metapage_dirty(mp);
- page_cache_get(page);
+ get_page(page);
wait_on_page_writeback(page);
}
unlock_page(page);
@@ -128,7 +128,7 @@ static inline void metapage_wait_for_io(struct metapage *mp)
static inline void _metapage_homeok(struct metapage *mp)
{
if (!--mp->nohomeok)
- page_cache_release(mp->page);
+ put_page(mp->page);
}
static inline void metapage_homeok(struct metapage *mp)
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 35976bdccafc..701f89370de7 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -983,6 +983,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
ip->i_op = &jfs_symlink_inode_operations;
+ inode_nohighmem(ip);
ip->i_mapping->a_ops = &jfs_aops;
/*
@@ -1372,9 +1373,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
tid_t tid;
struct tblock *tblk;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
jfs_info("jfs_mknod: %pd", dentry);
rc = dquot_initialize(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4cd9798f4948..78d599198bf5 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -496,9 +496,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
- if (!new_valid_dev(sb->s_bdev->bd_dev))
- return -EOVERFLOW;
-
sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
@@ -599,7 +596,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
* Page cache is indexed by long.
* I would use MAX_LFS_FILESIZE, but it's only half as big
*/
- sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1,
+ sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1,
(u64)sb->s_maxbytes);
#endif
sb->s_time_gran = 1;
@@ -795,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
struct buffer_head tmp_bh;
struct buffer_head *bh;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
while (towrite > 0) {
tocopy = sb->s_blocksize - offset < towrite ?
sb->s_blocksize - offset : towrite;
@@ -827,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
}
out:
if (len == towrite) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
if (inode->i_size < off+len-towrite)
@@ -835,7 +832,7 @@ out:
inode->i_version++;
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return len - towrite;
}
@@ -901,7 +898,7 @@ static int __init init_jfs_fs(void)
jfs_inode_cachep =
kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (jfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 5929e2363cb8..f8db4fde0b0b 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -23,7 +23,7 @@
const struct inode_operations jfs_fast_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
@@ -33,8 +33,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = {
const struct inode_operations jfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = jfs_setattr,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 91e004518237..03b688d19f69 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -44,28 +44,122 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
}
-static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
- size_t buflen)
+/* kernfs_depth - compute depth from @from to @to */
+static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
- char *p = buf + buflen;
- int len;
+ size_t depth = 0;
- *--p = '\0';
+ while (to->parent && to != from) {
+ depth++;
+ to = to->parent;
+ }
+ return depth;
+}
- do {
- len = strlen(kn->name);
- if (p - buf < len + 1) {
- buf[0] = '\0';
- p = NULL;
- break;
- }
- p -= len;
- memcpy(p, kn->name, len);
- *--p = '/';
- kn = kn->parent;
- } while (kn && kn->parent);
+static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
+ struct kernfs_node *b)
+{
+ size_t da, db;
+ struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);
+
+ if (ra != rb)
+ return NULL;
+
+ da = kernfs_depth(ra->kn, a);
+ db = kernfs_depth(rb->kn, b);
+
+ while (da > db) {
+ a = a->parent;
+ da--;
+ }
+ while (db > da) {
+ b = b->parent;
+ db--;
+ }
- return p;
+ /* worst case b and a will be the same at root */
+ while (b != a) {
+ b = b->parent;
+ a = a->parent;
+ }
+
+ return a;
+}
+
+/**
+ * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
+ * where kn_from is treated as root of the path.
+ * @kn_from: kernfs node which should be treated as root for the path
+ * @kn_to: kernfs node to which path is needed
+ * @buf: buffer to copy the path into
+ * @buflen: size of @buf
+ *
+ * We need to handle a couple of scenarios here:
+ * [1] when @kn_from is an ancestor of @kn_to at some level
+ * kn_from: /n1/n2/n3
+ * kn_to: /n1/n2/n3/n4/n5
+ * result: /n4/n5
+ *
+ * [2] when @kn_from is on a different hierarchy and we need to find common
+ * ancestor between @kn_from and @kn_to.
+ * kn_from: /n1/n2/n3/n4
+ * kn_to: /n1/n2/n5
+ * result: /../../n5
+ * OR
+ * kn_from: /n1/n2/n3/n4/n5 [depth=5]
+ * kn_to: /n1/n2/n3 [depth=3]
+ * result: /../..
+ *
+ * return value: length of the string. If greater than or equal to buflen,
+ * then contents of buf are undefined. On error, -1 is returned.
+ */
+static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
+ struct kernfs_node *kn_from,
+ char *buf, size_t buflen)
+{
+ struct kernfs_node *kn, *common;
+ const char parent_str[] = "/..";
+ size_t depth_from, depth_to, len = 0, nlen = 0;
+ char *p;
+ int i;
+
+ if (!kn_from)
+ kn_from = kernfs_root(kn_to)->kn;
+
+ if (kn_from == kn_to)
+ return strlcpy(buf, "/", buflen);
+
+ common = kernfs_common_ancestor(kn_from, kn_to);
+ if (WARN_ON(!common))
+ return -1;
+
+ depth_to = kernfs_depth(common, kn_to);
+ depth_from = kernfs_depth(common, kn_from);
+
+ if (buf)
+ buf[0] = '\0';
+
+ for (i = 0; i < depth_from; i++)
+ len += strlcpy(buf + len, parent_str,
+ len < buflen ? buflen - len : 0);
+
+ /* Calculate how many bytes we need for the rest */
+ for (kn = kn_to; kn != common; kn = kn->parent)
+ nlen += strlen(kn->name) + 1;
+
+ if (len + nlen >= buflen)
+ return len + nlen;
+
+ p = buf + len + nlen;
+ *p = '\0';
+ for (kn = kn_to; kn != common; kn = kn->parent) {
+ nlen = strlen(kn->name);
+ p -= nlen;
+ memcpy(p, kn->name, nlen);
+ *(--p) = '/';
+ }
+
+ return len + nlen;
}
/**
@@ -115,6 +209,34 @@ size_t kernfs_path_len(struct kernfs_node *kn)
}
/**
+ * kernfs_path_from_node - build path of node @to relative to @from.
+ * @from: parent kernfs_node relative to which we need to build the path
+ * @to: kernfs_node of interest
+ * @buf: buffer to copy @to's path into
+ * @buflen: size of @buf
+ *
+ * Builds @to's path relative to @from in @buf. @from and @to must
+ * be on the same kernfs-root. If @from is not parent of @to, then a relative
+ * path (which includes '..'s) as needed to reach from @from to @to is
+ * returned.
+ *
+ * If @buf isn't long enough, the return value will be greater than @buflen
+ * and @buf contents are undefined.
+ */
+int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
+ char *buf, size_t buflen)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+ ret = kernfs_path_from_node_locked(to, from, buf, buflen);
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kernfs_path_from_node);
+
+/**
* kernfs_path - build full path of a given node
* @kn: kernfs_node of interest
* @buf: buffer to copy @kn's name into
@@ -127,13 +249,12 @@ size_t kernfs_path_len(struct kernfs_node *kn)
*/
char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
- unsigned long flags;
- char *p;
+ int ret;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, buf, buflen);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
- return p;
+ ret = kernfs_path_from_node(kn, NULL, buf, buflen);
+ if (ret < 0 || ret >= buflen)
+ return NULL;
+ return buf;
}
EXPORT_SYMBOL_GPL(kernfs_path);
@@ -164,17 +285,25 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
void pr_cont_kernfs_path(struct kernfs_node *kn)
{
unsigned long flags;
- char *p;
+ int sz;
spin_lock_irqsave(&kernfs_rename_lock, flags);
- p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
- if (p)
- pr_cont("%s", p);
- else
- pr_cont("<name too long>");
+ sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
+ if (sz < 0) {
+ pr_cont("(error)");
+ goto out;
+ }
+
+ if (sz >= sizeof(kernfs_pr_cont_buf)) {
+ pr_cont("(name too long)");
+ goto out;
+ }
+
+ pr_cont("%s", kernfs_pr_cont_buf);
+out:
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
}
@@ -541,14 +670,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
if (!kn)
goto err_out1;
- /*
- * If the ino of the sysfs entry created for a kmem cache gets
- * allocated from an ida layer, which is accounted to the memcg that
- * owns the cache, the memcg will get pinned forever. So do not account
- * ino ida allocations.
- */
- ret = ida_simple_get(&root->ino_ida, 1, 0,
- GFP_KERNEL | __GFP_NOACCOUNT);
+ ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
if (ret < 0)
goto err_out2;
kn->ino = ret;
@@ -694,6 +816,38 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
return NULL;
}
+static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
+ const unsigned char *path,
+ const void *ns)
+{
+ size_t len;
+ char *p, *name;
+
+ lockdep_assert_held(&kernfs_mutex);
+
+ /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
+ spin_lock_irq(&kernfs_rename_lock);
+
+ len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
+
+ if (len >= sizeof(kernfs_pr_cont_buf)) {
+ spin_unlock_irq(&kernfs_rename_lock);
+ return NULL;
+ }
+
+ p = kernfs_pr_cont_buf;
+
+ while ((name = strsep(&p, "/")) && parent) {
+ if (*name == '\0')
+ continue;
+ parent = kernfs_find_ns(parent, name, ns);
+ }
+
+ spin_unlock_irq(&kernfs_rename_lock);
+
+ return parent;
+}
+
/**
* kernfs_find_and_get_ns - find and get kernfs_node with the given name
* @parent: kernfs_node to search under
@@ -719,6 +873,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
/**
+ * kernfs_walk_and_get_ns - find and get kernfs_node with the given path
+ * @parent: kernfs_node to search under
+ * @path: path to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with path @path under @parent and get a reference
+ * if found. This function may sleep and returns pointer to the found
+ * kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
+ const char *path, const void *ns)
+{
+ struct kernfs_node *kn;
+
+ mutex_lock(&kernfs_mutex);
+ kn = kernfs_walk_ns(parent, path, ns);
+ kernfs_get(kn);
+ mutex_unlock(&kernfs_mutex);
+
+ return kn;
+}
+
+/**
* kernfs_create_root - create a new kernfs hierarchy
* @scops: optional syscall operations for the hierarchy
* @flags: KERNFS_ROOT_* flags
@@ -1472,9 +1649,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
struct inode *inode = file_inode(file);
loff_t ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = generic_file_llseek(file, offset, whence);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 756dd56aaf60..16405ae88d2d 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -205,7 +205,7 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
if (!attrs)
return -ENOMEM;
- return simple_xattr_remove(&attrs->xattrs, name);
+ return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
}
ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
@@ -230,7 +230,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
if (!attrs)
return -ENOMEM;
- return simple_xattr_list(&attrs->xattrs, buf, size);
+ return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
}
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 8eaf417187f1..f73541fbe7af 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -14,6 +14,7 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/namei.h>
#include "kernfs-internal.h"
@@ -62,6 +63,74 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
return NULL;
}
+/*
+ * find the next ancestor in the path down to @child, where @parent was the
+ * ancestor whose descendant we want to find.
+ *
+ * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
+ * node. If @parent is b, then we return the node for c.
+ * Passing in d as @parent is not ok.
+ */
+static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
+ struct kernfs_node *parent)
+{
+ if (child == parent) {
+ pr_crit_once("BUG in find_next_ancestor: called with parent == child");
+ return NULL;
+ }
+
+ while (child->parent != parent) {
+ if (!child->parent)
+ return NULL;
+ child = child->parent;
+ }
+
+ return child;
+}
+
+/**
+ * kernfs_node_dentry - get a dentry for the given kernfs_node
+ * @kn: kernfs_node for which a dentry is needed
+ * @sb: the kernfs super_block
+ */
+struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
+ struct super_block *sb)
+{
+ struct dentry *dentry;
+ struct kernfs_node *knparent = NULL;
+
+ BUG_ON(sb->s_op != &kernfs_sops);
+
+ dentry = dget(sb->s_root);
+
+ /* Check if this is the root kernfs_node */
+ if (!kn->parent)
+ return dentry;
+
+ knparent = find_next_ancestor(kn, NULL);
+ if (WARN_ON(!knparent))
+ return ERR_PTR(-EINVAL);
+
+ do {
+ struct dentry *dtmp;
+ struct kernfs_node *kntmp;
+
+ if (kn == knparent)
+ return dentry;
+ kntmp = find_next_ancestor(kn, knparent);
+ if (WARN_ON(!kntmp))
+ return ERR_PTR(-EINVAL);
+ mutex_lock(&d_inode(dentry)->i_mutex);
+ dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
+ mutex_unlock(&d_inode(dentry)->i_mutex);
+ dput(dentry);
+ if (IS_ERR(dtmp))
+ return dtmp;
+ knparent = kntmp;
+ dentry = dtmp;
+ } while (true);
+}
+
static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
{
struct kernfs_super_info *info = kernfs_info(sb);
@@ -69,8 +138,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
struct dentry *root;
info->sb = sb;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = magic;
sb->s_op = &kernfs_sops;
sb->s_time_gran = 1;
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index db272528ab5b..117b8b3416f9 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,18 +112,25 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
return error;
}
-static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
+static const char *kernfs_iop_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- int error = -ENOMEM;
- unsigned long page = get_zeroed_page(GFP_KERNEL);
- if (!page)
+ char *body;
+ int error;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+ body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!body)
return ERR_PTR(-ENOMEM);
- error = kernfs_getlink(dentry, (char *)page);
+ error = kernfs_getlink(dentry, body);
if (unlikely(error < 0)) {
- free_page((unsigned long)page);
+ kfree(body);
return ERR_PTR(error);
}
- return *cookie = (char *)page;
+ set_delayed_call(done, kfree_link, body);
+ return body;
}
const struct inode_operations kernfs_symlink_iops = {
@@ -132,8 +139,7 @@ const struct inode_operations kernfs_symlink_iops = {
.getxattr = kernfs_iop_getxattr,
.listxattr = kernfs_iop_listxattr,
.readlink = generic_readlink,
- .follow_link = kernfs_iop_follow_link,
- .put_link = free_page_put_link,
+ .get_link = kernfs_iop_get_link,
.setattr = kernfs_iop_setattr,
.getattr = kernfs_iop_getattr,
.permission = kernfs_iop_permission,
diff --git a/fs/libfs.c b/fs/libfs.c
index c7cbfb092e94..f3fa82ce9b70 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -25,7 +25,7 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
generic_fillattr(inode, stat);
- stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+ stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
EXPORT_SYMBOL(simple_getattr);
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(simple_getattr);
int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
buf->f_type = dentry->d_sb->s_magic;
- buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
return 0;
}
@@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close);
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
struct dentry *dentry = file->f_path.dentry;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
switch (whence) {
case 1:
offset += file->f_pos;
@@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
break;
default:
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return -EINVAL;
}
if (offset != file->f_pos) {
@@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
spin_unlock(&dentry->d_lock);
}
}
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);
@@ -395,7 +395,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
pgoff_t index;
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -403,10 +403,10 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
*pagep = page;
- if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ if (!PageUptodate(page) && (len != PAGE_SIZE)) {
+ unsigned from = pos & (PAGE_SIZE - 1);
- zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
+ zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
}
return 0;
}
@@ -442,7 +442,7 @@ int simple_write_end(struct file *file, struct address_space *mapping,
/* zero the stale part of the page if we did a short copy */
if (copied < len) {
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from = pos & (PAGE_SIZE - 1);
zero_user(page, from + copied, len - copied);
}
@@ -458,7 +458,7 @@ int simple_write_end(struct file *file, struct address_space *mapping,
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -477,8 +477,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
struct dentry *dentry;
int i;
- s->s_blocksize = PAGE_CACHE_SIZE;
- s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ s->s_blocksize = PAGE_SIZE;
+ s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = magic;
s->s_op = &simple_super_operations;
s->s_time_gran = 1;
@@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = sync_mapping_buffers(inode->i_mapping);
if (!(inode->i_state & I_DIRTY_ALL))
goto out;
@@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
ret = err;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);
@@ -994,12 +994,12 @@ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
u64 last_fs_block = num_blocks - 1;
u64 last_fs_page =
- last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+ last_fs_block >> (PAGE_SHIFT - blocksize_bits);
if (unlikely(num_blocks == 0))
return 0;
- if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+ if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
return -EINVAL;
if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
@@ -1019,17 +1019,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-void kfree_put_link(struct inode *unused, void *cookie)
-{
- kfree(cookie);
-}
-EXPORT_SYMBOL(kfree_put_link);
-
-void free_page_put_link(struct inode *unused, void *cookie)
+/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
+void kfree_link(void *p)
{
- free_page((unsigned long) cookie);
+ kfree(p);
}
-EXPORT_SYMBOL(free_page_put_link);
+EXPORT_SYMBOL(kfree_link);
/*
* nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -1092,14 +1087,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
}
EXPORT_SYMBOL(simple_nosetlease);
-const char *simple_follow_link(struct dentry *dentry, void **cookie)
+const char *simple_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
{
- return d_inode(dentry)->i_link;
+ return inode->i_link;
}
-EXPORT_SYMBOL(simple_follow_link);
+EXPORT_SYMBOL(simple_get_link);
const struct inode_operations simple_symlink_inode_operations = {
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.readlink = generic_readlink
};
EXPORT_SYMBOL(simple_symlink_inode_operations);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 969d589c848d..d716c9993a26 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
atomic_inc(&nsm->sm_count);
else {
host = NULL;
- nsm = nsm_get_handle(ni->sap, ni->salen,
+ nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
ni->hostname, ni->hostname_len);
if (unlikely(nsm == NULL)) {
dprintk("lockd: %s failed; no nsm handle\n",
@@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
host->h_nsmhandle = nsm;
host->h_addrbuf = nsm->sm_addrbuf;
host->net = ni->net;
+ strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
out:
return host;
@@ -534,17 +535,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
/**
* nlm_host_rebooted - Release all resources held by rebooted host
+ * @net: network namespace
* @info: pointer to decoded results of NLM_SM_NOTIFY call
*
* We were notified that the specified host has rebooted. Release
* all resources held by that peer.
*/
-void nlm_host_rebooted(const struct nlm_reboot *info)
+void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info)
{
struct nsm_handle *nsm;
struct nlm_host *host;
- nsm = nsm_reboot_lookup(info);
+ nsm = nsm_reboot_lookup(net, info);
if (unlikely(nsm == NULL))
return;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 47a32b6d9b90..19166d4a8d31 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -42,7 +42,7 @@ struct nsm_args {
u32 proc;
char *mon_name;
- char *nodename;
+ const char *nodename;
};
struct nsm_res {
@@ -51,7 +51,6 @@ struct nsm_res {
};
static const struct rpc_program nsm_program;
-static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
/*
@@ -87,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
return rpc_create(&args);
}
-static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
- struct rpc_clnt *clnt)
-{
- spin_lock(&ln->nsm_clnt_lock);
- if (ln->nsm_users == 0) {
- if (clnt == NULL)
- goto out;
- ln->nsm_clnt = clnt;
- }
- clnt = ln->nsm_clnt;
- ln->nsm_users++;
-out:
- spin_unlock(&ln->nsm_clnt_lock);
- return clnt;
-}
-
-static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
-{
- struct rpc_clnt *clnt, *new;
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- clnt = nsm_client_set(ln, NULL);
- if (clnt != NULL)
- goto out;
-
- clnt = new = nsm_create(net, nodename);
- if (IS_ERR(clnt))
- goto out;
-
- clnt = nsm_client_set(ln, new);
- if (clnt != new)
- rpc_shutdown_client(new);
-out:
- return clnt;
-}
-
-static void nsm_client_put(struct net *net)
-{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
- struct rpc_clnt *clnt = NULL;
-
- spin_lock(&ln->nsm_clnt_lock);
- ln->nsm_users--;
- if (ln->nsm_users == 0) {
- clnt = ln->nsm_clnt;
- ln->nsm_clnt = NULL;
- }
- spin_unlock(&ln->nsm_clnt_lock);
- if (clnt != NULL)
- rpc_shutdown_client(clnt);
-}
-
static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
- struct rpc_clnt *clnt)
+ const struct nlm_host *host)
{
int status;
+ struct rpc_clnt *clnt;
struct nsm_args args = {
.priv = &nsm->sm_priv,
.prog = NLM_PROGRAM,
.vers = 3,
.proc = NLMPROC_NSM_NOTIFY,
.mon_name = nsm->sm_mon_name,
- .nodename = clnt->cl_nodename,
+ .nodename = host->nodename,
};
struct rpc_message msg = {
.rpc_argp = &args,
@@ -158,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
memset(res, 0, sizeof(*res));
+ clnt = nsm_create(host->net, host->nodename);
+ if (IS_ERR(clnt)) {
+ dprintk("lockd: failed to create NSM upcall transport, "
+ "status=%ld, net=%p\n", PTR_ERR(clnt), host->net);
+ return PTR_ERR(clnt);
+ }
+
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
if (status == -ECONNREFUSED) {
@@ -171,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
status);
else
status = 0;
+
+ rpc_shutdown_client(clnt);
return status;
}
@@ -190,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host)
struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
int status;
- struct rpc_clnt *clnt;
- const char *nodename = NULL;
dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
if (nsm->sm_monitored)
return 0;
- if (host->h_rpcclnt)
- nodename = host->h_rpcclnt->cl_nodename;
-
/*
* Choose whether to record the caller_name or IP address of
* this peer in the local rpc.statd's database.
*/
nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- clnt = nsm_client_get(host->net, nodename);
- if (IS_ERR(clnt)) {
- status = PTR_ERR(clnt);
- dprintk("lockd: failed to create NSM upcall transport, "
- "status=%d, net=%p\n", status, host->net);
- return status;
- }
-
- status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host);
if (unlikely(res.status != 0))
status = -EIO;
if (unlikely(status < 0)) {
@@ -247,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host)
if (atomic_read(&nsm->sm_count) == 1
&& nsm->sm_monitored && !nsm->sm_sticky) {
- struct lockd_net *ln = net_generic(host->net, lockd_net_id);
-
dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
- status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host);
if (res.status != 0)
status = -EIO;
if (status < 0)
@@ -259,38 +201,38 @@ void nsm_unmonitor(const struct nlm_host *host)
nsm->sm_name);
else
nsm->sm_monitored = 0;
-
- nsm_client_put(host->net);
}
}
-static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
- const size_t len)
+static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles,
+ const char *hostname, const size_t len)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (strlen(nsm->sm_name) == len &&
memcmp(nsm->sm_name, hostname, len) == 0)
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles,
+ const struct sockaddr *sap)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (rpc_cmp_addr(nsm_addr(nsm), sap))
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles,
+ const struct nsm_private *priv)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (memcmp(nsm->sm_priv.data, priv->data,
sizeof(priv->data)) == 0)
return nsm;
@@ -353,6 +295,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
/**
* nsm_get_handle - Find or create a cached nsm_handle
+ * @net: network namespace
* @sap: pointer to socket address of handle to find
* @salen: length of socket address
* @hostname: pointer to C string containing hostname to find
@@ -365,11 +308,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
* @hostname cannot be found in the handle cache. Returns NULL if
* an error occurs.
*/
-struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+struct nsm_handle *nsm_get_handle(const struct net *net,
+ const struct sockaddr *sap,
const size_t salen, const char *hostname,
const size_t hostname_len)
{
struct nsm_handle *cached, *new = NULL;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
@@ -384,9 +329,10 @@ retry:
spin_lock(&nsm_lock);
if (nsm_use_hostnames && hostname != NULL)
- cached = nsm_lookup_hostname(hostname, hostname_len);
+ cached = nsm_lookup_hostname(&ln->nsm_handles,
+ hostname, hostname_len);
else
- cached = nsm_lookup_addr(sap);
+ cached = nsm_lookup_addr(&ln->nsm_handles, sap);
if (cached != NULL) {
atomic_inc(&cached->sm_count);
@@ -400,7 +346,7 @@ retry:
}
if (new != NULL) {
- list_add(&new->sm_link, &nsm_handles);
+ list_add(&new->sm_link, &ln->nsm_handles);
spin_unlock(&nsm_lock);
dprintk("lockd: created nsm_handle for %s (%s)\n",
new->sm_name, new->sm_addrbuf);
@@ -417,19 +363,22 @@ retry:
/**
* nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @net: network namespace
* @info: pointer to NLMPROC_SM_NOTIFY arguments
*
* Returns a matching nsm_handle if found in the nsm cache. The returned
* nsm_handle's reference count is bumped. Otherwise returns NULL if some
* error occurred.
*/
-struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+struct nsm_handle *nsm_reboot_lookup(const struct net *net,
+ const struct nlm_reboot *info)
{
struct nsm_handle *cached;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
spin_lock(&nsm_lock);
- cached = nsm_lookup_priv(&info->priv);
+ cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv);
if (unlikely(cached == NULL)) {
spin_unlock(&nsm_lock);
dprintk("lockd: never saw rebooted peer '%.*s' before\n",
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 097bfa3adb1c..5426189406c1 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,9 +12,7 @@ struct lockd_net {
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
- spinlock_t nsm_clnt_lock;
- unsigned int nsm_users;
- struct rpc_clnt *nsm_clnt;
+ struct list_head nsm_handles;
};
extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d678bcc3cbcb..154a107cd376 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,13 +25,17 @@
#include <linux/mutex.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/inetdevice.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <net/ip.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <linux/lockd/lockd.h>
#include <linux/nfs.h>
@@ -44,7 +48,7 @@
static struct svc_program nlmsvc_program;
-struct nlmsvc_binding * nlmsvc_ops;
+const struct nlmsvc_binding *nlmsvc_ops;
EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
@@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void)
static void grace_ender(struct work_struct *grace)
{
- struct delayed_work *dwork = container_of(grace, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(grace);
struct lockd_net *ln = container_of(dwork, struct lockd_net,
grace_period_end);
@@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
}
}
+static int lockd_inetaddr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct sockaddr_in sin;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inetaddr_event: removed %pI4\n",
+ &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inetaddr_notifier = {
+ .notifier_call = lockd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int lockd_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct sockaddr_in6 sin6;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nlmsvc_rqst) {
+ dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+ (struct sockaddr *)&sin6);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inet6addr_notifier = {
+ .notifier_call = lockd_inet6addr_event,
+};
+#endif
+
+static void lockd_svc_exit_thread(void)
+{
+ unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
+ svc_exit_thread(nlmsvc_rqst);
+}
+
static int lockd_start_svc(struct svc_serv *serv)
{
int error;
@@ -315,7 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv)
return 0;
out_task:
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
nlmsvc_task = NULL;
out_rqst:
nlmsvc_rqst = NULL;
@@ -360,6 +425,10 @@ static struct svc_serv *lockd_create_svc(void)
printk(KERN_WARNING "lockd_up: create service failed\n");
return ERR_PTR(-ENOMEM);
}
+ register_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
dprintk("lockd_up: service created\n");
return serv;
}
@@ -428,7 +497,7 @@ lockd_down(struct net *net)
}
kthread_stop(nlmsvc_task);
dprintk("lockd_down: service stopped\n");
- svc_exit_thread(nlmsvc_rqst);
+ lockd_svc_exit_thread();
dprintk("lockd_down: service destroyed\n");
nlmsvc_task = NULL;
nlmsvc_rqst = NULL;
@@ -592,7 +661,7 @@ static int lockd_init_net(struct net *net)
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
INIT_LIST_HEAD(&ln->lockd_manager.list);
ln->lockd_manager.block_opens = false;
- spin_lock_init(&ln->nsm_clnt_lock);
+ INIT_LIST_HEAD(&ln->nsm_handles);
return 0;
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b147d1ae71fd..09c576f26c7b 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 21171f0c6477..fb26b9f522e7 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/locks.c b/fs/locks.c
index 0d2b3267e2a3..7c5f91be9b65 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -119,7 +119,6 @@
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -230,16 +229,44 @@ locks_get_lock_context(struct inode *inode, int type)
ctx = smp_load_acquire(&inode->i_flctx);
}
out:
+ trace_locks_get_lock_context(inode, type, ctx);
return ctx;
}
+static void
+locks_dump_ctx_list(struct list_head *list, char *list_type)
+{
+ struct file_lock *fl;
+
+ list_for_each_entry(fl, list, fl_list) {
+ pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+ }
+}
+
+static void
+locks_check_ctx_lists(struct inode *inode)
+{
+ struct file_lock_context *ctx = inode->i_flctx;
+
+ if (unlikely(!list_empty(&ctx->flc_flock) ||
+ !list_empty(&ctx->flc_posix) ||
+ !list_empty(&ctx->flc_lease))) {
+ pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
+ MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+ inode->i_ino);
+ locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
+ locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
+ locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
+ }
+}
+
void
-locks_free_lock_context(struct file_lock_context *ctx)
+locks_free_lock_context(struct inode *inode)
{
- if (ctx) {
- WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
- WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
- WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
+ struct file_lock_context *ctx = inode->i_flctx;
+
+ if (unlikely(ctx)) {
+ locks_check_ctx_lists(inode);
kmem_cache_free(flctx_cache, ctx);
}
}
@@ -934,7 +961,8 @@ out:
return error;
}
-static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
+static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+ struct file_lock *conflock)
{
struct file_lock *fl, *tmp;
struct file_lock *new_fl = NULL;
@@ -1142,6 +1170,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
if (new_fl2)
locks_free_lock(new_fl2);
locks_dispose_list(&dispose);
+ trace_posix_lock_inode(inode, request, error);
+
return error;
}
@@ -1162,7 +1192,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
int posix_lock_file(struct file *filp, struct file_lock *fl,
struct file_lock *conflock)
{
- return __posix_lock_file(file_inode(filp), fl, conflock);
+ return posix_lock_inode(file_inode(filp), fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);
@@ -1178,7 +1208,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
int error;
might_sleep ();
for (;;) {
- error = __posix_lock_file(inode, fl, NULL);
+ error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1191,6 +1221,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
return error;
}
+#ifdef CONFIG_MANDATORY_FILE_LOCKING
/**
* locks_mandatory_locked - Check for an active lock
* @file: the file to check
@@ -1227,20 +1258,16 @@ int locks_mandatory_locked(struct file *file)
/**
* locks_mandatory_area - Check for a conflicting lock
- * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ
- * for shared
- * @inode: the file to check
+ * @inode: the file to check
* @filp: how the file was opened (if it was)
- * @offset: start of area to check
- * @count: length of area to check
+ * @start: first byte in the file to check
+ * @end: lastbyte in the file to check
+ * @type: %F_WRLCK for a write lock, else %F_RDLCK
*
* Searches the inode's list of locks to find any POSIX locks which conflict.
- * This function is called from rw_verify_area() and
- * locks_verify_truncate().
*/
-int locks_mandatory_area(int read_write, struct inode *inode,
- struct file *filp, loff_t offset,
- size_t count)
+int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
+ loff_t end, unsigned char type)
{
struct file_lock fl;
int error;
@@ -1252,15 +1279,15 @@ int locks_mandatory_area(int read_write, struct inode *inode,
fl.fl_flags = FL_POSIX | FL_ACCESS;
if (filp && !(filp->f_flags & O_NONBLOCK))
sleep = true;
- fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
- fl.fl_start = offset;
- fl.fl_end = offset + count - 1;
+ fl.fl_type = type;
+ fl.fl_start = start;
+ fl.fl_end = end;
for (;;) {
if (filp) {
fl.fl_owner = filp;
fl.fl_flags &= ~FL_SLEEP;
- error = __posix_lock_file(inode, &fl, NULL);
+ error = posix_lock_inode(inode, &fl, NULL);
if (!error)
break;
}
@@ -1268,7 +1295,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
if (sleep)
fl.fl_flags |= FL_SLEEP;
fl.fl_owner = current->files;
- error = __posix_lock_file(inode, &fl, NULL);
+ error = posix_lock_inode(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
@@ -1289,6 +1316,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
}
EXPORT_SYMBOL(locks_mandatory_area);
+#endif /* CONFIG_MANDATORY_FILE_LOCKING */
static void lease_clear_pending(struct file_lock *fl, int arg)
{
@@ -1503,12 +1531,10 @@ void lease_get_mtime(struct inode *inode, struct timespec *time)
ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
- if (!list_empty(&ctx->flc_lease)) {
- fl = list_first_entry(&ctx->flc_lease,
- struct file_lock, fl_list);
- if (fl->fl_type == F_WRLCK)
- has_lease = true;
- }
+ fl = list_first_entry_or_null(&ctx->flc_lease,
+ struct file_lock, fl_list);
+ if (fl && (fl->fl_type == F_WRLCK))
+ has_lease = true;
spin_unlock(&ctx->flc_lock);
}
@@ -1624,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
* bother, maybe that's a sign this just isn't a good file to
* hand out a delegation on.
*/
- if (is_deleg && !mutex_trylock(&inode->i_mutex))
+ if (is_deleg && !inode_trylock(inode))
return -EAGAIN;
if (is_deleg && arg == F_WRLCK) {
/* Write delegations are not currently supported: */
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
WARN_ON_ONCE(1);
return -EINVAL;
}
@@ -1706,7 +1732,7 @@ out:
spin_unlock(&ctx->flc_lock);
locks_dispose_list(&dispose);
if (is_deleg)
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!error && !my_fl)
*flp = NULL;
return error;
@@ -2165,6 +2191,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (file_lock == NULL)
return -ENOLCK;
+ inode = file_inode(filp);
+
/*
* This might block, so we do it before checking the inode.
*/
@@ -2172,8 +2200,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
if (copy_from_user(&flock, l, sizeof(flock)))
goto out;
- inode = file_inode(filp);
-
/* Don't allow mandatory locks on files that may be memory mapped
* and shared.
*/
@@ -2182,7 +2208,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2221,23 +2246,29 @@ again:
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by
- * releasing the lock that was just acquired.
- */
- /*
- * we need that spin_lock here - it prevents reordering between
- * update of i_flctx->flc_posix and check for it done in close().
- * rcu_read_lock() wouldn't do.
+ * Attempt to detect a close/fcntl race and recover by releasing the
+ * lock that was just acquired. There is no need to do that when we're
+ * unlocking though, or for OFD locks.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
+ trace_fcntl_setlk(inode, file_lock, error);
locks_free_lock(file_lock);
return error;
}
@@ -2322,7 +2353,6 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
goto out;
}
-again:
error = flock64_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
@@ -2361,17 +2391,27 @@ again:
error = do_lock_file_wait(filp, cmd, file_lock);
/*
- * Attempt to detect a close/fcntl race and recover by
- * releasing the lock that was just acquired.
+ * Attempt to detect a close/fcntl race and recover by releasing the
+ * lock that was just acquired. There is no need to do that when we're
+ * unlocking though, or for OFD locks.
*/
- spin_lock(&current->files->file_lock);
- f = fcheck(fd);
- spin_unlock(&current->files->file_lock);
- if (!error && f != filp && flock.l_type != F_UNLCK) {
- flock.l_type = F_UNLCK;
- goto again;
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+ spin_lock(&current->files->file_lock);
+ f = fcheck(fd);
+ spin_unlock(&current->files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+ WARN_ON_ONCE(error);
+ error = -EBADF;
+ }
}
-
out:
locks_free_lock(file_lock);
return error;
@@ -2385,6 +2425,7 @@ out:
*/
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
+ int error;
struct file_lock lock;
struct file_lock_context *ctx;
@@ -2407,10 +2448,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
lock.fl_ops = NULL;
lock.fl_lmops = NULL;
- vfs_lock_file(filp, F_SETLK, &lock, NULL);
+ error = vfs_lock_file(filp, F_SETLK, &lock, NULL);
if (lock.fl_ops && lock.fl_ops->fl_release_private)
lock.fl_ops->fl_release_private(&lock);
+ trace_locks_remove_posix(file_inode(filp), &lock, error);
}
EXPORT_SYMBOL(locks_remove_posix);
@@ -2706,7 +2748,7 @@ static int __init proc_locks_init(void)
proc_create("locks", 0, NULL, &proc_locks_operations);
return 0;
}
-module_init(proc_locks_init);
+fs_initcall(proc_locks_init);
#endif
static int __init filelock_init(void)
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index 09ed066c0221..2b4503163930 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
config LOGFS
tristate "LogFS file system"
- depends on (MTD || BLOCK)
+ depends on MTD || (!MTD && BLOCK)
select ZLIB_INFLATE
select ZLIB_DEFLATE
select CRC32
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a709d80c8ebc..cc26f8f215f5 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -64,7 +64,7 @@ static void writeseg_end_io(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i) {
end_page_writeback(bvec->bv_page);
- page_cache_release(bvec->bv_page);
+ put_page(bvec->bv_page);
}
bio_put(bio);
if (atomic_dec_and_test(&super->s_pending_writes))
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 9c501449450d..b76a62b1978f 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -46,9 +46,9 @@ static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
- BUG_ON(len > PAGE_CACHE_SIZE);
- page_start = ofs & PAGE_CACHE_MASK;
- page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
+ BUG_ON(len > PAGE_SIZE);
+ page_start = ofs & PAGE_MASK;
+ page_end = PAGE_ALIGN(ofs + len) - 1;
ret = mtd_write(mtd, ofs, len, &retlen, buf);
if (ret || (retlen != len))
return -EIO;
@@ -82,7 +82,7 @@ static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
if (!page)
continue;
memset(page_address(page), 0xFF, PAGE_SIZE);
- page_cache_release(page);
+ put_page(page);
}
return 0;
}
@@ -195,7 +195,7 @@ static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
page_address(page));
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (err)
return err;
}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f9b45d46d4c4..ddbed2be5366 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -183,7 +183,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
if (name->len != be16_to_cpu(dd->namelen) ||
memcmp(name->name, dd->name, name->len)) {
kunmap_atomic(dd);
- page_cache_release(page);
+ put_page(page);
continue;
}
@@ -238,7 +238,7 @@ static int logfs_unlink(struct inode *dir, struct dentry *dentry)
return PTR_ERR(page);
}
index = page->index;
- page_cache_release(page);
+ put_page(page);
mutex_lock(&super->s_dirop_mutex);
logfs_add_transaction(dir, ta);
@@ -316,7 +316,7 @@ static int logfs_readdir(struct file *file, struct dir_context *ctx)
be16_to_cpu(dd->namelen),
be64_to_cpu(dd->ino), dd->type);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
if (full)
break;
}
@@ -349,7 +349,7 @@ static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
dd = kmap_atomic(page);
ino = be64_to_cpu(dd->ino);
kunmap_atomic(dd);
- page_cache_release(page);
+ put_page(page);
inode = logfs_iget(dir->i_sb, ino);
if (IS_ERR(inode))
@@ -392,7 +392,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
err = logfs_write_buf(dir, page, WF_LOCK);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (!err)
grow_dir(dir, index);
return err;
@@ -528,7 +528,8 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
return PTR_ERR(inode);
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
return __logfs_create(dir, dentry, inode, target, destlen);
@@ -560,7 +561,7 @@ static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
map = kmap_atomic(page);
memcpy(dd, map, sizeof(*dd));
kunmap_atomic(map);
- page_cache_release(page);
+ put_page(page);
return 0;
}
@@ -776,12 +777,6 @@ fail:
return -EIO;
}
-const struct inode_operations logfs_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
-
const struct inode_operations logfs_dir_iops = {
.create = logfs_create,
.link = logfs_link,
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 1a6f0167b16a..f01ddfb1a03b 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -15,21 +15,21 @@ static int logfs_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct page *page;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
- if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+ if ((len == PAGE_SIZE) || PageUptodate(page))
return 0;
- if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ if ((pos & PAGE_MASK) >= i_size_read(inode)) {
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + len;
/* Reading beyond i_size is simple: memset to zero */
- zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+ zero_user_segments(page, 0, start, end, PAGE_SIZE);
return 0;
}
return logfs_readpage_nolock(page);
@@ -41,11 +41,11 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
pgoff_t index = page->index;
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned end = start + copied;
int ret = 0;
- BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+ BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize);
BUG_ON(page->index > I3_BLOCKS);
if (copied < len) {
@@ -61,8 +61,8 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
if (copied == 0)
goto out; /* FIXME: do we need to update inode? */
- if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
- i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+ if (i_size_read(inode) < (index << PAGE_SHIFT) + end) {
+ i_size_write(inode, (index << PAGE_SHIFT) + end);
mark_inode_dirty_sync(inode);
}
@@ -75,7 +75,7 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
}
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ret ? ret : copied;
}
@@ -118,7 +118,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
unsigned offset;
u64 bix;
level_t level;
@@ -142,7 +142,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
return __logfs_writepage(page);
/* Is the page fully outside i_size? (truncate in progress) */
- offset = i_size & (PAGE_CACHE_SIZE-1);
+ offset = i_size & (PAGE_SIZE-1);
if (bix > end_index || offset == 0) {
unlock_page(page);
return 0; /* don't care */
@@ -155,7 +155,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
*/
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
return __logfs_writepage(page);
}
@@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = li->li_flags;
flags &= LOGFS_FL_USER_MODIFIABLE;
flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
li->li_flags = flags;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
inode->i_ctime = CURRENT_TIME;
mark_inode_dirty_sync(inode);
@@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
logfs_put_wblocks(sb, NULL, WF_LOCK);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index af49e2d6941a..db9cfc598883 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -64,7 +64,8 @@ static void logfs_inode_setops(struct inode *inode)
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFLNK:
- inode->i_op = &logfs_symlink_iops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &logfs_reg_aops;
break;
case S_IFSOCK: /* fall through */
@@ -408,7 +409,8 @@ const struct super_operations logfs_super_operations = {
int logfs_init_inode_cache(void)
{
logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
- sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+ sizeof(struct logfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
logfs_init_once);
if (!logfs_inode_cache)
return -ENOMEM;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 5f0937609465..27d040e35faa 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -302,7 +302,7 @@ struct logfs_block {
struct inode *inode;
struct logfs_transaction *ta;
unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
- struct logfs_block_ops *ops;
+ const struct logfs_block_ops *ops;
int full;
int partial;
int reserved_bytes;
@@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
#endif
/* dev_mtd.c */
-#ifdef CONFIG_MTD
+#if IS_ENABLED(CONFIG_MTD)
int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
#else
static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
@@ -495,7 +495,6 @@ static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
#endif
/* dir.c */
-extern const struct inode_operations logfs_symlink_iops;
extern const struct inode_operations logfs_dir_iops;
extern const struct file_operations logfs_dir_fops;
int logfs_replay_journal(struct super_block *sb);
@@ -579,7 +578,7 @@ int logfs_exist_block(struct inode *inode, u64 bix);
int get_page_reserve(struct inode *inode, struct page *page);
void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
-extern struct logfs_block_ops indirect_block_ops;
+extern const struct logfs_block_ops indirect_block_ops;
/* segment.c */
int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 380d86e1ab45..3fb8c6d67303 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -281,7 +281,7 @@ static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
static void logfs_put_read_page(struct page *page)
{
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
static void logfs_lock_write_page(struct page *page)
@@ -323,7 +323,7 @@ repeat:
return NULL;
err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
if (unlikely(err)) {
- page_cache_release(page);
+ put_page(page);
if (err == -EEXIST)
goto repeat;
return NULL;
@@ -342,7 +342,7 @@ static void logfs_unlock_write_page(struct page *page)
static void logfs_put_write_page(struct page *page)
{
logfs_unlock_write_page(page);
- page_cache_release(page);
+ put_page(page);
}
static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
@@ -562,20 +562,20 @@ static void indirect_free_block(struct super_block *sb,
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
set_page_private(page, 0);
}
__free_block(sb, block);
}
-static struct logfs_block_ops inode_block_ops = {
+static const struct logfs_block_ops inode_block_ops = {
.write_block = inode_write_block,
.free_block = inode_free_block,
.write_alias = inode_write_alias,
};
-struct logfs_block_ops indirect_block_ops = {
+const struct logfs_block_ops indirect_block_ops = {
.write_block = indirect_write_block,
.free_block = indirect_free_block,
.write_alias = indirect_write_alias,
@@ -655,7 +655,7 @@ static void alloc_data_block(struct inode *inode, struct page *page)
block->page = page;
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long) block);
block->ops = &indirect_block_ops;
@@ -709,7 +709,7 @@ static u64 block_get_pointer(struct page *page, int index)
static int logfs_read_empty(struct page *page)
{
- zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_SIZE);
return 0;
}
@@ -1660,7 +1660,7 @@ static int truncate_data_block(struct inode *inode, struct page *page,
if (err)
return err;
- zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
+ zero_user_segment(page, size - pageofs, PAGE_SIZE);
return logfs_segment_write(inode, page, shadow);
}
@@ -1919,7 +1919,7 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
block->page = NULL;
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
set_page_private(page, 0);
}
}
@@ -1940,7 +1940,7 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long) block);
}
@@ -1971,7 +1971,7 @@ int logfs_read_inode(struct inode *inode)
logfs_disk_to_inode(di, inode);
kunmap_atomic(di);
move_page_to_inode(inode, page);
- page_cache_release(page);
+ put_page(page);
return 0;
}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 7f9b096d8d57..1efd6055f4b0 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
filler_t *filler = super->s_devops->readpage;
struct page *page;
- BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+ BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
if (use_filler)
page = read_cache_page(mapping, index, filler, sb);
else {
@@ -90,9 +90,9 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
}
- page_cache_release(page);
+ put_page(page);
buf += copylen;
len -= copylen;
@@ -117,9 +117,9 @@ static void pad_partial_page(struct logfs_area *area)
memset(page_address(page) + offset, 0xff, len);
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
}
@@ -129,20 +129,20 @@ static void pad_full_pages(struct logfs_area *area)
struct logfs_super *super = logfs_super(sb);
u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
u32 len = super->s_segsize - area->a_used_bytes;
- pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
- pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+ pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT;
+ pgoff_t no_indizes = len >> PAGE_SHIFT;
struct page *page;
while (no_indizes) {
page = get_mapping_page(sb, index, 0);
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
+ memset(page_address(page), 0xff, PAGE_SIZE);
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
}
- page_cache_release(page);
+ put_page(page);
index++;
no_indizes--;
}
@@ -197,7 +197,7 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
return 0;
}
-static struct logfs_block_ops btree_block_ops = {
+static const struct logfs_block_ops btree_block_ops = {
.write_block = btree_write_block,
.free_block = __free_block,
.write_alias = btree_write_alias,
@@ -411,7 +411,7 @@ int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
if (IS_ERR(page))
return PTR_ERR(page);
memcpy(buf, page_address(page) + offset, copylen);
- page_cache_release(page);
+ put_page(page);
buf += copylen;
len -= copylen;
@@ -499,7 +499,7 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
if (!PagePrivate(page)) {
SetPagePrivate(page);
- page_cache_get(page);
+ get_page(page);
set_page_private(page, (unsigned long) block);
}
block->ops = &indirect_block_ops;
@@ -554,7 +554,7 @@ void move_page_to_btree(struct page *page)
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
set_page_private(page, 0);
}
block->ops = &btree_block_ops;
@@ -723,9 +723,9 @@ void freeseg(struct super_block *sb, u32 segno)
continue;
if (PagePrivate(page)) {
ClearPagePrivate(page);
- page_cache_release(page);
+ put_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 54360293bcb5..5751082dba52 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -48,7 +48,7 @@ void emergency_read_end(struct page *page)
if (page == emergency_page)
mutex_unlock(&emergency_mutex);
else
- page_cache_release(page);
+ put_page(page);
}
static void dump_segfile(struct super_block *sb)
@@ -206,7 +206,7 @@ static int write_one_sb(struct super_block *sb,
logfs_set_segment_erased(sb, segno, ec, 0);
logfs_write_ds(sb, ds, segno, ec);
err = super->s_devops->write_sb(sb, page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -366,24 +366,24 @@ static struct page *find_super_block(struct super_block *sb)
return NULL;
last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
if (!last || IS_ERR(last)) {
- page_cache_release(first);
+ put_page(first);
return NULL;
}
if (!logfs_check_ds(page_address(first))) {
- page_cache_release(last);
+ put_page(last);
return first;
}
/* First one didn't work, try the second superblock */
if (!logfs_check_ds(page_address(last))) {
- page_cache_release(first);
+ put_page(first);
return last;
}
/* Neither worked, sorry folks */
- page_cache_release(first);
- page_cache_release(last);
+ put_page(first);
+ put_page(last);
return NULL;
}
@@ -425,7 +425,7 @@ static int __logfs_read_sb(struct super_block *sb)
super->s_data_levels = ds->ds_data_levels;
super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
+ super->s_data_levels;
- page_cache_release(page);
+ put_page(page);
return 0;
}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 187477ded6b3..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,858 +1,433 @@
-/*
- * linux/fs/mbcache.c
- * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
- */
-
-/*
- * Filesystem Meta Information Block Cache (mbcache)
- *
- * The mbcache caches blocks of block devices that need to be located
- * by their device/block number, as well as by other criteria (such
- * as the block's contents).
- *
- * There can only be one cache entry in a cache per device and block number.
- * Additional indexes need not be unique in this sense. The number of
- * additional indexes (=other criteria) can be hardwired at compile time
- * or specified at cache create time.
- *
- * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
- * in the cache. A valid entry is in the main hash tables of the cache,
- * and may also be in the lru list. An invalid entry is not in any hashes
- * or lists.
- *
- * A valid cache entry is only in the lru list if no handles refer to it.
- * Invalid cache entries will be freed when the last handle to the cache
- * entry is released. Entries that cannot be freed immediately are put
- * back on the lru list.
- */
-
-/*
- * Lock descriptions and usage:
- *
- * Each hash chain of both the block and index hash tables now contains
- * a built-in lock used to serialize accesses to the hash chain.
- *
- * Accesses to global data structures mb_cache_list and mb_cache_lru_list
- * are serialized via the global spinlock mb_cache_spinlock.
- *
- * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
- * accesses to its local data, such as e_used and e_queued.
- *
- * Lock ordering:
- *
- * Each block hash chain's lock has the highest lock order, followed by an
- * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
- * lock), and mb_cach_spinlock, with the lowest order. While holding
- * either a block or index hash chain lock, a thread can acquire an
- * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
- *
- * Synchronization:
- *
- * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
- * index hash chian, it needs to lock the corresponding hash chain. For each
- * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
- * prevent either any simultaneous release or free on the entry and also
- * to serialize accesses to either the e_used or e_queued member of the entry.
- *
- * To avoid having a dangling reference to an already freed
- * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
- * block hash chain and also no longer being referenced, both e_used,
- * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
- * first removed from a block hash chain.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <linux/hash.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/spinlock.h>
#include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/list.h>
#include <linux/list_bl.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
#include <linux/mbcache.h>
-#include <linux/init.h>
-#include <linux/blockgroup_lock.h>
-#include <linux/log2.h>
-
-#ifdef MB_CACHE_DEBUG
-# define mb_debug(f...) do { \
- printk(KERN_DEBUG f); \
- printk("\n"); \
- } while (0)
-#define mb_assert(c) do { if (!(c)) \
- printk(KERN_ERR "assertion " #c " failed\n"); \
- } while(0)
-#else
-# define mb_debug(f...) do { } while(0)
-# define mb_assert(c) do { } while(0)
-#endif
-#define mb_error(f...) do { \
- printk(KERN_ERR f); \
- printk("\n"); \
- } while(0)
-
-#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
-
-#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
-#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
- (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
-
-static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
-static struct blockgroup_lock *mb_cache_bg_lock;
-static struct kmem_cache *mb_cache_kmem_cache;
-
-MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
-MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
-MODULE_LICENSE("GPL");
-
-EXPORT_SYMBOL(mb_cache_create);
-EXPORT_SYMBOL(mb_cache_shrink);
-EXPORT_SYMBOL(mb_cache_destroy);
-EXPORT_SYMBOL(mb_cache_entry_alloc);
-EXPORT_SYMBOL(mb_cache_entry_insert);
-EXPORT_SYMBOL(mb_cache_entry_release);
-EXPORT_SYMBOL(mb_cache_entry_free);
-EXPORT_SYMBOL(mb_cache_entry_get);
-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-EXPORT_SYMBOL(mb_cache_entry_find_first);
-EXPORT_SYMBOL(mb_cache_entry_find_next);
-#endif
/*
- * Global data: list of all mbcache's, lru list, and a spinlock for
- * accessing cache data structures on SMP machines. The lru list is
- * global across all mbcaches.
+ * Mbcache is a simple key-value store. Keys need not be unique, however
+ * key-value pairs are expected to be unique (we use this fact in
+ * mb_cache_entry_delete_block()).
+ *
+ * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
+ * They use hash of a block contents as a key and block number as a value.
+ * That's why keys need not be unique (different xattr blocks may end up having
+ * the same hash). However block number always uniquely identifies a cache
+ * entry.
+ *
+ * We provide functions for creation and removal of entries, search by key,
+ * and a special "delete entry with given key-value pair" operation. Fixed
+ * size hash table is used for fast key lookups.
*/
-static LIST_HEAD(mb_cache_list);
-static LIST_HEAD(mb_cache_lru_list);
-static DEFINE_SPINLOCK(mb_cache_spinlock);
-
-static inline void
-__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
-{
- spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
- MB_CACHE_ENTRY_LOCK_INDEX(ce)));
-}
-
-static inline void
-__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
-{
- spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
- MB_CACHE_ENTRY_LOCK_INDEX(ce)));
-}
-
-static inline int
-__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
-{
- return !hlist_bl_unhashed(&ce->e_block_list);
-}
+struct mb_cache {
+ /* Hash table of entries */
+ struct hlist_bl_head *c_hash;
+ /* log2 of hash table size */
+ int c_bucket_bits;
+ /* Maximum entries in cache to avoid degrading hash too much */
+ int c_max_entries;
+ /* Protects c_list, c_entry_count */
+ spinlock_t c_list_lock;
+ struct list_head c_list;
+ /* Number of entries in cache */
+ unsigned long c_entry_count;
+ struct shrinker c_shrink;
+ /* Work for shrinking when the cache has too many entries */
+ struct work_struct c_shrink_work;
+};
+static struct kmem_cache *mb_entry_cache;
-static inline void
-__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
-{
- if (__mb_cache_entry_is_block_hashed(ce))
- hlist_bl_del_init(&ce->e_block_list);
-}
+static unsigned long mb_cache_shrink(struct mb_cache *cache,
+ unsigned int nr_to_scan);
-static inline int
-__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
+ u32 key)
{
- return !hlist_bl_unhashed(&ce->e_index.o_list);
+ return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
}
-static inline void
-__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
-{
- if (__mb_cache_entry_is_index_hashed(ce))
- hlist_bl_del_init(&ce->e_index.o_list);
-}
+/*
+ * Number of entries to reclaim synchronously when there are too many entries
+ * in cache
+ */
+#define SYNC_SHRINK_BATCH 64
/*
- * __mb_cache_entry_unhash_unlock()
- *
- * This function is called to unhash both the block and index hash
- * chain.
- * It assumes both the block and index hash chain is locked upon entry.
- * It also unlock both hash chains both exit
+ * mb_cache_entry_create - create entry in cache
+ * @cache - cache where the entry should be created
+ * @mask - gfp mask with which the entry should be allocated
+ * @key - key of the entry
+ * @block - block that contains data
+ * @reusable - is the block reusable by other inodes?
+ *
+ * Creates entry in @cache with key @key and records that data is stored in
+ * block @block. The function returns -EBUSY if entry with the same key
+ * and for the same block already exists in cache. Otherwise 0 is returned.
*/
-static inline void
-__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
+int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
+ sector_t block, bool reusable)
{
- __mb_cache_entry_unhash_index(ce);
- hlist_bl_unlock(ce->e_index_hash_p);
- __mb_cache_entry_unhash_block(ce);
- hlist_bl_unlock(ce->e_block_hash_p);
+ struct mb_cache_entry *entry, *dup;
+ struct hlist_bl_node *dup_node;
+ struct hlist_bl_head *head;
+
+ /* Schedule background reclaim if there are too many entries */
+ if (cache->c_entry_count >= cache->c_max_entries)
+ schedule_work(&cache->c_shrink_work);
+ /* Do some sync reclaim if background reclaim cannot keep up */
+ if (cache->c_entry_count >= 2*cache->c_max_entries)
+ mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
+
+ entry = kmem_cache_alloc(mb_entry_cache, mask);
+ if (!entry)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&entry->e_list);
+ /* One ref for hash; the LRU list ref is grabbed below */
+ atomic_set(&entry->e_refcnt, 1);
+ entry->e_key = key;
+ entry->e_block = block;
+ entry->e_reusable = reusable;
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
+ if (dup->e_key == key && dup->e_block == block) {
+ hlist_bl_unlock(head);
+ kmem_cache_free(mb_entry_cache, entry);
+ return -EBUSY;
+ }
+ }
+ hlist_bl_add_head(&entry->e_hash_list, head);
+ hlist_bl_unlock(head);
+
+ spin_lock(&cache->c_list_lock);
+ list_add_tail(&entry->e_list, &cache->c_list);
+ /* Grab ref for LRU list */
+ atomic_inc(&entry->e_refcnt);
+ cache->c_entry_count++;
+ spin_unlock(&cache->c_list_lock);
+
+ return 0;
}
+EXPORT_SYMBOL(mb_cache_entry_create);
-static void
-__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
+void __mb_cache_entry_free(struct mb_cache_entry *entry)
{
- struct mb_cache *cache = ce->e_cache;
-
- mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
- kmem_cache_free(cache->c_entry_cache, ce);
- atomic_dec(&cache->c_entry_count);
+ kmem_cache_free(mb_entry_cache, entry);
}
+EXPORT_SYMBOL(__mb_cache_entry_free);
-static void
-__mb_cache_entry_release(struct mb_cache_entry *ce)
+static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
+ struct mb_cache_entry *entry,
+ u32 key)
{
- /* First lock the entry to serialize access to its local data. */
- __spin_lock_mb_cache_entry(ce);
- /* Wake up all processes queuing for this cache entry. */
- if (ce->e_queued)
- wake_up_all(&mb_cache_queue);
- if (ce->e_used >= MB_CACHE_WRITER)
- ce->e_used -= MB_CACHE_WRITER;
- /*
- * Make sure that all cache entries on lru_list have
- * both e_used and e_qued of 0s.
- */
- ce->e_used--;
- if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
- if (!__mb_cache_entry_is_block_hashed(ce)) {
- __spin_unlock_mb_cache_entry(ce);
- goto forget;
+ struct mb_cache_entry *old_entry = entry;
+ struct hlist_bl_node *node;
+ struct hlist_bl_head *head;
+
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
+ node = entry->e_hash_list.next;
+ else
+ node = hlist_bl_first(head);
+ while (node) {
+ entry = hlist_bl_entry(node, struct mb_cache_entry,
+ e_hash_list);
+ if (entry->e_key == key && entry->e_reusable) {
+ atomic_inc(&entry->e_refcnt);
+ goto out;
}
- /*
- * Need access to lru list, first drop entry lock,
- * then reacquire the lock in the proper order.
- */
- spin_lock(&mb_cache_spinlock);
- if (list_empty(&ce->e_lru_list))
- list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
- spin_unlock(&mb_cache_spinlock);
+ node = node->next;
}
- __spin_unlock_mb_cache_entry(ce);
- return;
-forget:
- mb_assert(list_empty(&ce->e_lru_list));
- __mb_cache_entry_forget(ce, GFP_KERNEL);
+ entry = NULL;
+out:
+ hlist_bl_unlock(head);
+ if (old_entry)
+ mb_cache_entry_put(cache, old_entry);
+
+ return entry;
}
/*
- * mb_cache_shrink_scan() memory pressure callback
- *
- * This function is called by the kernel memory management when memory
- * gets low.
+ * mb_cache_entry_find_first - find the first entry in cache with given key
+ * @cache: cache where we should search
+ * @key: key to look for
*
- * @shrink: (ignored)
- * @sc: shrink_control passed from reclaim
- *
- * Returns the number of objects freed.
+ * Search in @cache for entry with key @key. Grabs reference to the first
+ * entry found and returns the entry.
*/
-static unsigned long
-mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
+ u32 key)
{
- LIST_HEAD(free_list);
- struct mb_cache_entry *entry, *tmp;
- int nr_to_scan = sc->nr_to_scan;
- gfp_t gfp_mask = sc->gfp_mask;
- unsigned long freed = 0;
-
- mb_debug("trying to free %d entries", nr_to_scan);
- spin_lock(&mb_cache_spinlock);
- while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
- struct mb_cache_entry *ce =
- list_entry(mb_cache_lru_list.next,
- struct mb_cache_entry, e_lru_list);
- list_del_init(&ce->e_lru_list);
- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
- continue;
- spin_unlock(&mb_cache_spinlock);
- /* Prevent any find or get operation on the entry */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- /* Ignore if it is touched by a find/get */
- if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
- !list_empty(&ce->e_lru_list)) {
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_unlock(ce->e_block_hash_p);
- spin_lock(&mb_cache_spinlock);
- continue;
- }
- __mb_cache_entry_unhash_unlock(ce);
- list_add_tail(&ce->e_lru_list, &free_list);
- spin_lock(&mb_cache_spinlock);
- }
- spin_unlock(&mb_cache_spinlock);
-
- list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
- __mb_cache_entry_forget(entry, gfp_mask);
- freed++;
- }
- return freed;
+ return __entry_find(cache, NULL, key);
}
+EXPORT_SYMBOL(mb_cache_entry_find_first);
-static unsigned long
-mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+/*
+ * mb_cache_entry_find_next - find next entry in cache with the same key
+ * @cache: cache where we should search
+ * @entry: entry to start search from
+ *
+ * Finds next entry in the hash chain which has the same key as @entry.
+ * If @entry is unhashed (which can happen when deletion of entry races
+ * with the search), finds the first entry in the hash chain. The function
+ * drops reference to @entry and returns with a reference to the found entry.
+ */
+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
+ struct mb_cache_entry *entry)
{
- struct mb_cache *cache;
- unsigned long count = 0;
-
- spin_lock(&mb_cache_spinlock);
- list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
- mb_debug("cache %s (%d)", cache->c_name,
- atomic_read(&cache->c_entry_count));
- count += atomic_read(&cache->c_entry_count);
- }
- spin_unlock(&mb_cache_spinlock);
-
- return vfs_pressure_ratio(count);
+ return __entry_find(cache, entry, entry->e_key);
}
-
-static struct shrinker mb_cache_shrinker = {
- .count_objects = mb_cache_shrink_count,
- .scan_objects = mb_cache_shrink_scan,
- .seeks = DEFAULT_SEEKS,
-};
+EXPORT_SYMBOL(mb_cache_entry_find_next);
/*
- * mb_cache_create() create a new cache
- *
- * All entries in one cache are equal size. Cache entries may be from
- * multiple devices. If this is the first mbcache created, registers
- * the cache with kernel memory management. Returns NULL if no more
- * memory was available.
- *
- * @name: name of the cache (informal)
- * @bucket_bits: log2(number of hash buckets)
+ * mb_cache_entry_get - get a cache entry by block number (and key)
+ * @cache - cache we work with
+ * @key - key of block number @block
+ * @block - block number
*/
-struct mb_cache *
-mb_cache_create(const char *name, int bucket_bits)
+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
+ sector_t block)
{
- int n, bucket_count = 1 << bucket_bits;
- struct mb_cache *cache = NULL;
-
- if (!mb_cache_bg_lock) {
- mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
- GFP_KERNEL);
- if (!mb_cache_bg_lock)
- return NULL;
- bgl_lock_init(mb_cache_bg_lock);
- }
-
- cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
- if (!cache)
- return NULL;
- cache->c_name = name;
- atomic_set(&cache->c_entry_count, 0);
- cache->c_bucket_bits = bucket_bits;
- cache->c_block_hash = kmalloc(bucket_count *
- sizeof(struct hlist_bl_head), GFP_KERNEL);
- if (!cache->c_block_hash)
- goto fail;
- for (n=0; n<bucket_count; n++)
- INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
- cache->c_index_hash = kmalloc(bucket_count *
- sizeof(struct hlist_bl_head), GFP_KERNEL);
- if (!cache->c_index_hash)
- goto fail;
- for (n=0; n<bucket_count; n++)
- INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
- if (!mb_cache_kmem_cache) {
- mb_cache_kmem_cache = kmem_cache_create(name,
- sizeof(struct mb_cache_entry), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
- if (!mb_cache_kmem_cache)
- goto fail2;
+ struct hlist_bl_node *node;
+ struct hlist_bl_head *head;
+ struct mb_cache_entry *entry;
+
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+ if (entry->e_key == key && entry->e_block == block) {
+ atomic_inc(&entry->e_refcnt);
+ goto out;
+ }
}
- cache->c_entry_cache = mb_cache_kmem_cache;
-
- /*
- * Set an upper limit on the number of cache entries so that the hash
- * chains won't grow too long.
- */
- cache->c_max_entries = bucket_count << 4;
-
- spin_lock(&mb_cache_spinlock);
- list_add(&cache->c_cache_list, &mb_cache_list);
- spin_unlock(&mb_cache_spinlock);
- return cache;
-
-fail2:
- kfree(cache->c_index_hash);
-
-fail:
- kfree(cache->c_block_hash);
- kfree(cache);
- return NULL;
+ entry = NULL;
+out:
+ hlist_bl_unlock(head);
+ return entry;
}
+EXPORT_SYMBOL(mb_cache_entry_get);
-
-/*
- * mb_cache_shrink()
- *
- * Removes all cache entries of a device from the cache. All cache entries
- * currently in use cannot be freed, and thus remain in the cache. All others
- * are freed.
+/* mb_cache_entry_delete_block - remove information about block from cache
+ * @cache - cache we work with
+ * @key - key of block @block
+ * @block - block number
*
- * @bdev: which device's cache entries to shrink
+ * Remove entry from cache @cache with key @key with data stored in @block.
*/
-void
-mb_cache_shrink(struct block_device *bdev)
+void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
+ sector_t block)
{
- LIST_HEAD(free_list);
- struct list_head *l;
- struct mb_cache_entry *ce, *tmp;
-
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- while (!list_is_last(l, &mb_cache_lru_list)) {
- l = l->next;
- ce = list_entry(l, struct mb_cache_entry, e_lru_list);
- if (ce->e_bdev == bdev) {
- list_del_init(&ce->e_lru_list);
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt))
- continue;
- spin_unlock(&mb_cache_spinlock);
- /*
- * Prevent any find or get operation on the entry.
- */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- /* Ignore if it is touched by a find/get */
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt) ||
- !list_empty(&ce->e_lru_list)) {
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_unlock(ce->e_block_hash_p);
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- continue;
+ struct hlist_bl_node *node;
+ struct hlist_bl_head *head;
+ struct mb_cache_entry *entry;
+
+ head = mb_cache_entry_head(cache, key);
+ hlist_bl_lock(head);
+ hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+ if (entry->e_key == key && entry->e_block == block) {
+ /* We keep hash list reference to keep entry alive */
+ hlist_bl_del_init(&entry->e_hash_list);
+ hlist_bl_unlock(head);
+ spin_lock(&cache->c_list_lock);
+ if (!list_empty(&entry->e_list)) {
+ list_del_init(&entry->e_list);
+ cache->c_entry_count--;
+ atomic_dec(&entry->e_refcnt);
}
- __mb_cache_entry_unhash_unlock(ce);
- mb_assert(!(ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt)));
- list_add_tail(&ce->e_lru_list, &free_list);
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
+ spin_unlock(&cache->c_list_lock);
+ mb_cache_entry_put(cache, entry);
+ return;
}
}
- spin_unlock(&mb_cache_spinlock);
-
- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
- __mb_cache_entry_forget(ce, GFP_KERNEL);
- }
+ hlist_bl_unlock(head);
}
+EXPORT_SYMBOL(mb_cache_entry_delete_block);
-
-/*
- * mb_cache_destroy()
+/* mb_cache_entry_touch - cache entry got used
+ * @cache - cache the entry belongs to
+ * @entry - entry that got used
*
- * Shrinks the cache to its minimum possible size (hopefully 0 entries),
- * and then destroys it. If this was the last mbcache, un-registers the
- * mbcache from kernel memory management.
+ * Marks entry as used to give it higher chances of surviving in cache.
*/
-void
-mb_cache_destroy(struct mb_cache *cache)
+void mb_cache_entry_touch(struct mb_cache *cache,
+ struct mb_cache_entry *entry)
{
- LIST_HEAD(free_list);
- struct mb_cache_entry *ce, *tmp;
-
- spin_lock(&mb_cache_spinlock);
- list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
- if (ce->e_cache == cache)
- list_move_tail(&ce->e_lru_list, &free_list);
- }
- list_del(&cache->c_cache_list);
- spin_unlock(&mb_cache_spinlock);
-
- list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
- list_del_init(&ce->e_lru_list);
- /*
- * Prevent any find or get operation on the entry.
- */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- mb_assert(!(ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt)));
- __mb_cache_entry_unhash_unlock(ce);
- __mb_cache_entry_forget(ce, GFP_KERNEL);
- }
-
- if (atomic_read(&cache->c_entry_count) > 0) {
- mb_error("cache %s: %d orphaned entries",
- cache->c_name,
- atomic_read(&cache->c_entry_count));
- }
-
- if (list_empty(&mb_cache_list)) {
- kmem_cache_destroy(mb_cache_kmem_cache);
- mb_cache_kmem_cache = NULL;
- }
- kfree(cache->c_index_hash);
- kfree(cache->c_block_hash);
- kfree(cache);
+ entry->e_referenced = 1;
}
+EXPORT_SYMBOL(mb_cache_entry_touch);
-/*
- * mb_cache_entry_alloc()
- *
- * Allocates a new cache entry. The new entry will not be valid initially,
- * and thus cannot be looked up yet. It should be filled with data, and
- * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
- * if no more memory was available.
- */
-struct mb_cache_entry *
-mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
+static unsigned long mb_cache_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- struct mb_cache_entry *ce;
-
- if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
- struct list_head *l;
-
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- while (!list_is_last(l, &mb_cache_lru_list)) {
- l = l->next;
- ce = list_entry(l, struct mb_cache_entry, e_lru_list);
- if (ce->e_cache == cache) {
- list_del_init(&ce->e_lru_list);
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt))
- continue;
- spin_unlock(&mb_cache_spinlock);
- /*
- * Prevent any find or get operation on the
- * entry.
- */
- hlist_bl_lock(ce->e_block_hash_p);
- hlist_bl_lock(ce->e_index_hash_p);
- /* Ignore if it is touched by a find/get */
- if (ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt) ||
- !list_empty(&ce->e_lru_list)) {
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_unlock(ce->e_block_hash_p);
- l = &mb_cache_lru_list;
- spin_lock(&mb_cache_spinlock);
- continue;
- }
- mb_assert(list_empty(&ce->e_lru_list));
- mb_assert(!(ce->e_used || ce->e_queued ||
- atomic_read(&ce->e_refcnt)));
- __mb_cache_entry_unhash_unlock(ce);
- goto found;
- }
- }
- spin_unlock(&mb_cache_spinlock);
- }
+ struct mb_cache *cache = container_of(shrink, struct mb_cache,
+ c_shrink);
- ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
- if (!ce)
- return NULL;
- atomic_inc(&cache->c_entry_count);
- INIT_LIST_HEAD(&ce->e_lru_list);
- INIT_HLIST_BL_NODE(&ce->e_block_list);
- INIT_HLIST_BL_NODE(&ce->e_index.o_list);
- ce->e_cache = cache;
- ce->e_queued = 0;
- atomic_set(&ce->e_refcnt, 0);
-found:
- ce->e_block_hash_p = &cache->c_block_hash[0];
- ce->e_index_hash_p = &cache->c_index_hash[0];
- ce->e_used = 1 + MB_CACHE_WRITER;
- return ce;
+ return cache->c_entry_count;
}
-
-/*
- * mb_cache_entry_insert()
- *
- * Inserts an entry that was allocated using mb_cache_entry_alloc() into
- * the cache. After this, the cache entry can be looked up, but is not yet
- * in the lru list as the caller still holds a handle to it. Returns 0 on
- * success, or -EBUSY if a cache entry for that device + inode exists
- * already (this may happen after a failed lookup, but when another process
- * has inserted the same cache entry in the meantime).
- *
- * @bdev: device the cache entry belongs to
- * @block: block number
- * @key: lookup key
- */
-int
-mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
- sector_t block, unsigned int key)
+/* Shrink number of entries in cache */
+static unsigned long mb_cache_shrink(struct mb_cache *cache,
+ unsigned int nr_to_scan)
{
- struct mb_cache *cache = ce->e_cache;
- unsigned int bucket;
- struct hlist_bl_node *l;
- struct hlist_bl_head *block_hash_p;
- struct hlist_bl_head *index_hash_p;
- struct mb_cache_entry *lce;
-
- mb_assert(ce);
- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
- cache->c_bucket_bits);
- block_hash_p = &cache->c_block_hash[bucket];
- hlist_bl_lock(block_hash_p);
- hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
- if (lce->e_bdev == bdev && lce->e_block == block) {
- hlist_bl_unlock(block_hash_p);
- return -EBUSY;
+ struct mb_cache_entry *entry;
+ struct hlist_bl_head *head;
+ unsigned int shrunk = 0;
+
+ spin_lock(&cache->c_list_lock);
+ while (nr_to_scan-- && !list_empty(&cache->c_list)) {
+ entry = list_first_entry(&cache->c_list,
+ struct mb_cache_entry, e_list);
+ if (entry->e_referenced) {
+ /*
+ * Recently used entry: clear the flag and rotate the entry
+ * itself (not the list head) to the tail of the LRU list so
+ * that it gets a second chance before being reclaimed.
+ */
+ entry->e_referenced = 0;
+ list_move_tail(&entry->e_list, &cache->c_list);
+ continue;
}
+ list_del_init(&entry->e_list);
+ cache->c_entry_count--;
+ /*
+ * We keep LRU list reference so that entry doesn't go away
+ * from under us.
+ */
+ spin_unlock(&cache->c_list_lock);
+ head = mb_cache_entry_head(cache, entry->e_key);
+ hlist_bl_lock(head);
+ if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+ hlist_bl_del_init(&entry->e_hash_list);
+ atomic_dec(&entry->e_refcnt);
+ }
+ hlist_bl_unlock(head);
+ if (mb_cache_entry_put(cache, entry))
+ shrunk++;
+ cond_resched();
+ spin_lock(&cache->c_list_lock);
}
- mb_assert(!__mb_cache_entry_is_block_hashed(ce));
- __mb_cache_entry_unhash_block(ce);
- __mb_cache_entry_unhash_index(ce);
- ce->e_bdev = bdev;
- ce->e_block = block;
- ce->e_block_hash_p = block_hash_p;
- ce->e_index.o_key = key;
- hlist_bl_add_head(&ce->e_block_list, block_hash_p);
- hlist_bl_unlock(block_hash_p);
- bucket = hash_long(key, cache->c_bucket_bits);
- index_hash_p = &cache->c_index_hash[bucket];
- hlist_bl_lock(index_hash_p);
- ce->e_index_hash_p = index_hash_p;
- hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
- hlist_bl_unlock(index_hash_p);
- return 0;
-}
+ spin_unlock(&cache->c_list_lock);
+ return shrunk;
+}
-/*
- * mb_cache_entry_release()
- *
- * Release a handle to a cache entry. When the last handle to a cache entry
- * is released it is either freed (if it is invalid) or otherwise inserted
- * in to the lru list.
- */
-void
-mb_cache_entry_release(struct mb_cache_entry *ce)
+static unsigned long mb_cache_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- __mb_cache_entry_release(ce);
+ int nr_to_scan = sc->nr_to_scan;
+ struct mb_cache *cache = container_of(shrink, struct mb_cache,
+ c_shrink);
+ return mb_cache_shrink(cache, nr_to_scan);
}
+/* We shrink 1/X of the cache when we have too many entries in it */
+#define SHRINK_DIVISOR 16
-/*
- * mb_cache_entry_free()
- *
- */
-void
-mb_cache_entry_free(struct mb_cache_entry *ce)
+static void mb_cache_shrink_worker(struct work_struct *work)
{
- mb_assert(ce);
- mb_assert(list_empty(&ce->e_lru_list));
- hlist_bl_lock(ce->e_index_hash_p);
- __mb_cache_entry_unhash_index(ce);
- hlist_bl_unlock(ce->e_index_hash_p);
- hlist_bl_lock(ce->e_block_hash_p);
- __mb_cache_entry_unhash_block(ce);
- hlist_bl_unlock(ce->e_block_hash_p);
- __mb_cache_entry_release(ce);
+ struct mb_cache *cache = container_of(work, struct mb_cache,
+ c_shrink_work);
+ mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
}
-
/*
- * mb_cache_entry_get()
+ * mb_cache_create - create cache
+ * @bucket_bits: log2 of the hash table size
*
- * Get a cache entry by device / block number. (There can only be one entry
- * in the cache per device and block.) Returns NULL if no such cache entry
- * exists. The returned cache entry is locked for exclusive access ("single
- * writer").
+ * Create cache for keys with 2^bucket_bits hash entries.
*/
-struct mb_cache_entry *
-mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
- sector_t block)
+struct mb_cache *mb_cache_create(int bucket_bits)
{
- unsigned int bucket;
- struct hlist_bl_node *l;
- struct mb_cache_entry *ce;
- struct hlist_bl_head *block_hash_p;
-
- bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
- cache->c_bucket_bits);
- block_hash_p = &cache->c_block_hash[bucket];
- /* First serialize access to the block corresponding hash chain. */
- hlist_bl_lock(block_hash_p);
- hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
- mb_assert(ce->e_block_hash_p == block_hash_p);
- if (ce->e_bdev == bdev && ce->e_block == block) {
- /*
- * Prevent a free from removing the entry.
- */
- atomic_inc(&ce->e_refcnt);
- hlist_bl_unlock(block_hash_p);
- __spin_lock_mb_cache_entry(ce);
- atomic_dec(&ce->e_refcnt);
- if (ce->e_used > 0) {
- DEFINE_WAIT(wait);
- while (ce->e_used > 0) {
- ce->e_queued++;
- prepare_to_wait(&mb_cache_queue, &wait,
- TASK_UNINTERRUPTIBLE);
- __spin_unlock_mb_cache_entry(ce);
- schedule();
- __spin_lock_mb_cache_entry(ce);
- ce->e_queued--;
- }
- finish_wait(&mb_cache_queue, &wait);
- }
- ce->e_used += 1 + MB_CACHE_WRITER;
- __spin_unlock_mb_cache_entry(ce);
+ struct mb_cache *cache;
+ int bucket_count = 1 << bucket_bits;
+ int i;
- if (!list_empty(&ce->e_lru_list)) {
- spin_lock(&mb_cache_spinlock);
- list_del_init(&ce->e_lru_list);
- spin_unlock(&mb_cache_spinlock);
- }
- if (!__mb_cache_entry_is_block_hashed(ce)) {
- __mb_cache_entry_release(ce);
- return NULL;
- }
- return ce;
- }
+ if (!try_module_get(THIS_MODULE))
+ return NULL;
+
+ cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
+ if (!cache)
+ goto err_out;
+ cache->c_bucket_bits = bucket_bits;
+ cache->c_max_entries = bucket_count << 4;
+ INIT_LIST_HEAD(&cache->c_list);
+ spin_lock_init(&cache->c_list_lock);
+ cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
+ GFP_KERNEL);
+ if (!cache->c_hash) {
+ kfree(cache);
+ goto err_out;
}
- hlist_bl_unlock(block_hash_p);
- return NULL;
-}
+ for (i = 0; i < bucket_count; i++)
+ INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
-#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
+ cache->c_shrink.count_objects = mb_cache_count;
+ cache->c_shrink.scan_objects = mb_cache_scan;
+ cache->c_shrink.seeks = DEFAULT_SEEKS;
+ /*
+ * register_shrinker() can fail; do not hand out a cache whose
+ * shrinker was never registered, since mb_cache_destroy() would
+ * later try to unregister it.
+ */
+ if (register_shrinker(&cache->c_shrink)) {
+ kfree(cache->c_hash);
+ kfree(cache);
+ goto err_out;
+ }
-static struct mb_cache_entry *
-__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
- struct block_device *bdev, unsigned int key)
-{
+ INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
- /* The index hash chain is alredy acquire by caller. */
- while (l != NULL) {
- struct mb_cache_entry *ce =
- hlist_bl_entry(l, struct mb_cache_entry,
- e_index.o_list);
- mb_assert(ce->e_index_hash_p == head);
- if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
- /*
- * Prevent a free from removing the entry.
- */
- atomic_inc(&ce->e_refcnt);
- hlist_bl_unlock(head);
- __spin_lock_mb_cache_entry(ce);
- atomic_dec(&ce->e_refcnt);
- ce->e_used++;
- /* Incrementing before holding the lock gives readers
- priority over writers. */
- if (ce->e_used >= MB_CACHE_WRITER) {
- DEFINE_WAIT(wait);
-
- while (ce->e_used >= MB_CACHE_WRITER) {
- ce->e_queued++;
- prepare_to_wait(&mb_cache_queue, &wait,
- TASK_UNINTERRUPTIBLE);
- __spin_unlock_mb_cache_entry(ce);
- schedule();
- __spin_lock_mb_cache_entry(ce);
- ce->e_queued--;
- }
- finish_wait(&mb_cache_queue, &wait);
- }
- __spin_unlock_mb_cache_entry(ce);
- if (!list_empty(&ce->e_lru_list)) {
- spin_lock(&mb_cache_spinlock);
- list_del_init(&ce->e_lru_list);
- spin_unlock(&mb_cache_spinlock);
- }
- if (!__mb_cache_entry_is_block_hashed(ce)) {
- __mb_cache_entry_release(ce);
- return ERR_PTR(-EAGAIN);
- }
- return ce;
- }
- l = l->next;
- }
- hlist_bl_unlock(head);
+ return cache;
+
+err_out:
+ module_put(THIS_MODULE);
return NULL;
}
-
+EXPORT_SYMBOL(mb_cache_create);
/*
- * mb_cache_entry_find_first()
- *
- * Find the first cache entry on a given device with a certain key in
- * an additional index. Additional matches can be found with
- * mb_cache_entry_find_next(). Returns NULL if no match was found. The
- * returned cache entry is locked for shared access ("multiple readers").
+ * mb_cache_destroy - destroy cache
+ * @cache: the cache to destroy
*
- * @cache: the cache to search
- * @bdev: the device the cache entry should belong to
- * @key: the key in the index
+ * Free all entries in cache and cache itself. Caller must make sure nobody
+ * (except shrinker) can reach @cache when calling this.
*/
-struct mb_cache_entry *
-mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
- unsigned int key)
+void mb_cache_destroy(struct mb_cache *cache)
{
- unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct hlist_bl_node *l;
- struct mb_cache_entry *ce = NULL;
- struct hlist_bl_head *index_hash_p;
-
- index_hash_p = &cache->c_index_hash[bucket];
- hlist_bl_lock(index_hash_p);
- if (!hlist_bl_empty(index_hash_p)) {
- l = hlist_bl_first(index_hash_p);
- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
- } else
- hlist_bl_unlock(index_hash_p);
- return ce;
-}
+ struct mb_cache_entry *entry, *next;
+ unregister_shrinker(&cache->c_shrink);
-/*
- * mb_cache_entry_find_next()
- *
- * Find the next cache entry on a given device with a certain key in an
- * additional index. Returns NULL if no match could be found. The previous
- * entry is atomatically released, so that mb_cache_entry_find_next() can
- * be called like this:
- *
- * entry = mb_cache_entry_find_first();
- * while (entry) {
- * ...
- * entry = mb_cache_entry_find_next(entry, ...);
- * }
- *
- * @prev: The previous match
- * @bdev: the device the cache entry should belong to
- * @key: the key in the index
- */
-struct mb_cache_entry *
-mb_cache_entry_find_next(struct mb_cache_entry *prev,
- struct block_device *bdev, unsigned int key)
-{
- struct mb_cache *cache = prev->e_cache;
- unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct hlist_bl_node *l;
- struct mb_cache_entry *ce;
- struct hlist_bl_head *index_hash_p;
-
- index_hash_p = &cache->c_index_hash[bucket];
- mb_assert(prev->e_index_hash_p == index_hash_p);
- hlist_bl_lock(index_hash_p);
- mb_assert(!hlist_bl_empty(index_hash_p));
- l = prev->e_index.o_list.next;
- ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
- __mb_cache_entry_release(prev);
- return ce;
+ /*
+ * We don't bother with any locking. Cache must not be used at this
+ * point.
+ */
+ list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
+ if (!hlist_bl_unhashed(&entry->e_hash_list)) {
+ hlist_bl_del_init(&entry->e_hash_list);
+ atomic_dec(&entry->e_refcnt);
+ } else
+ WARN_ON(1);
+ list_del(&entry->e_list);
+ WARN_ON(atomic_read(&entry->e_refcnt) != 1);
+ mb_cache_entry_put(cache, entry);
+ }
+ kfree(cache->c_hash);
+ kfree(cache);
+ module_put(THIS_MODULE);
}
+EXPORT_SYMBOL(mb_cache_destroy);
-#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
-
-static int __init init_mbcache(void)
+static int __init mbcache_init(void)
{
- register_shrinker(&mb_cache_shrinker);
+ mb_entry_cache = kmem_cache_create("mbcache",
+ sizeof(struct mb_cache_entry), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+ BUG_ON(!mb_entry_cache);
return 0;
}
-static void __exit exit_mbcache(void)
+static void __exit mbcache_exit(void)
{
- unregister_shrinker(&mb_cache_shrinker);
+ kmem_cache_destroy(mb_entry_cache);
}
-module_init(init_mbcache)
-module_exit(exit_mbcache)
+module_init(mbcache_init)
+module_exit(mbcache_exit)
+MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
+MODULE_LICENSE("GPL");
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d19ac258105a..33957c07cd11 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -28,7 +28,7 @@ const struct file_operations minix_dir_operations = {
static inline void dir_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -38,10 +38,10 @@ static inline void dir_put_page(struct page *page)
static unsigned
minix_last_byte(struct inode *inode, unsigned long page_nr)
{
- unsigned last_byte = PAGE_CACHE_SIZE;
+ unsigned last_byte = PAGE_SIZE;
- if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT))
- last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ if (page_nr == (inode->i_size >> PAGE_SHIFT))
+ last_byte = inode->i_size & (PAGE_SIZE - 1);
return last_byte;
}
@@ -92,8 +92,8 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
if (pos >= inode->i_size)
return 0;
- offset = pos & ~PAGE_CACHE_MASK;
- n = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_MASK;
+ n = pos >> PAGE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *p, *kaddr, *limit;
@@ -229,7 +229,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
lock_page(page);
kaddr = (char*)page_address(page);
dir_end = kaddr + minix_last_byte(dir, n);
- limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize;
+ limit = kaddr + PAGE_SIZE - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
de = (minix_dirent *)p;
de3 = (minix3_dirent *)p;
@@ -327,7 +327,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
}
kaddr = kmap_atomic(page);
- memset(kaddr, 0, PAGE_CACHE_SIZE);
+ memset(kaddr, 0, PAGE_SIZE);
if (sbi->s_version == MINIX_V3) {
minix3_dirent *de3 = (minix3_dirent *)kaddr;
@@ -350,7 +350,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 086cd0a61e80..f975d667c539 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int __init init_inodecache(void)
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (minix_inode_cachep == NULL)
return -ENOMEM;
@@ -435,8 +435,7 @@ static const struct address_space_operations minix_aops = {
static const struct inode_operations minix_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = minix_getattr,
};
@@ -452,6 +451,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
inode->i_mapping->a_ops = &minix_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &minix_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 282e15ad8cd8..46ca39d6c735 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -24,16 +24,15 @@ static inline block_t *i_data(struct inode *inode)
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
int n = 0;
- char b[BDEVNAME_SIZE];
if (block < 0) {
- printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
- block, bdevname(inode->i_sb->s_bdev, b));
+ printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+ block, inode->i_sb->s_bdev);
} else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
if (printk_ratelimit())
printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %s\n",
- block, bdevname(inode->i_sb->s_bdev, b));
+ "block %ld too big on dev %pg\n",
+ block, inode->i_sb->s_bdev);
} else if (block < 7) {
offsets[n++] = block;
} else if ((block -= 7) < 512) {
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index 78e2d93e5c83..1ee101352586 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -26,18 +26,17 @@ static inline block_t *i_data(struct inode *inode)
static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
{
int n = 0;
- char b[BDEVNAME_SIZE];
struct super_block *sb = inode->i_sb;
if (block < 0) {
- printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
- block, bdevname(sb->s_bdev, b));
+ printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+ block, sb->s_bdev);
} else if ((u64)block * (u64)sb->s_blocksize >=
minix_sb(sb)->s_max_size) {
if (printk_ratelimit())
printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %s\n",
- block, bdevname(sb->s_bdev, b));
+ "block %ld too big on dev %pg\n",
+ block, sb->s_bdev);
} else if (block < DIRCOUNT) {
offsets[n++] = block;
} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index a795a11e50c7..2887d1d95ce2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -243,11 +243,11 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index 09abba7653aa..eedc644b78d7 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -24,6 +24,7 @@
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
@@ -106,7 +107,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
* don't make any buffers if there is only one buffer on
* the page and the page just needs to be set up to date
*/
- if (inode->i_blkbits == PAGE_CACHE_SHIFT &&
+ if (inode->i_blkbits == PAGE_SHIFT &&
buffer_uptodate(bh)) {
SetPageUptodate(page);
return;
@@ -144,7 +145,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
{
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
@@ -161,7 +162,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
if (page_has_buffers(page))
goto confused;
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
@@ -248,7 +249,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
}
if (first_hole != blocks_per_page) {
- zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
+ zero_user_segment(page, first_hole << blkbits, PAGE_SIZE);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -330,7 +331,7 @@ confused:
*
* then this code just gives up and calls the buffer_head-based read function.
* It does handle a page which has holes at the end - that is a common case:
- * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
+ * the end-of-file on blocksize < PAGE_SIZE setups.
*
* BH_Boundary explanation:
*
@@ -361,12 +362,12 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(mapping);
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_entry(pages->prev, struct page, lru);
+ struct page *page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
@@ -379,7 +380,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
&first_logical_block,
get_block, gfp);
}
- page_cache_release(page);
+ put_page(page);
}
BUG_ON(!list_empty(pages));
if (bio)
@@ -397,7 +398,7 @@ int mpage_readpage(struct page *page, get_block_t get_block)
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
- gfp_t gfp = GFP_KERNEL & mapping_gfp_mask(page->mapping);
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -471,7 +472,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
unsigned long end_index;
- const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+ const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
@@ -541,7 +542,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
* The page has no buffers: map it to disk
*/
BUG_ON(!PageUptodate(page));
- block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+ block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = (i_size - 1) >> blkbits;
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) {
@@ -573,7 +574,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
first_unmapped = page_block;
page_is_mapped:
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (page->index >= end_index) {
/*
* The page straddles i_size. It must be zeroed out on each
@@ -583,11 +584,11 @@ page_is_mapped:
* is zeroed when mapped, and writes to that region are not
* written out to the file."
*/
- unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = i_size & (PAGE_SIZE - 1);
if (page->index > end_index || !offset)
goto confused;
- zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset, PAGE_SIZE);
}
/*
diff --git a/fs/namei.c b/fs/namei.c
index 6f567347f14f..1d9ca2d5dff6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -505,13 +505,13 @@ struct nameidata {
int total_link_count;
struct saved {
struct path link;
- void *cookie;
+ struct delayed_call done;
const char *name;
- struct inode *inode;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
+ struct inode *link_inode;
unsigned root_seq;
int dfd;
};
@@ -534,10 +534,8 @@ static void restore_nameidata(void)
current->nameidata = old;
if (old)
old->total_link_count = now->total_link_count;
- if (now->stack != now->internal) {
+ if (now->stack != now->internal)
kfree(now->stack);
- now->stack = now->internal;
- }
}
static int __nd_alloc_stack(struct nameidata *nd)
@@ -592,11 +590,8 @@ static void drop_links(struct nameidata *nd)
int i = nd->depth;
while (i--) {
struct saved *last = nd->stack + i;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link) {
- inode->i_op->put_link(inode, last->cookie);
- last->cookie = NULL;
- }
+ do_delayed_call(&last->done);
+ clear_delayed_call(&last->done);
}
}
@@ -657,7 +652,7 @@ static bool legitimize_links(struct nameidata *nd)
* Path walking has 2 modes, rcu-walk and ref-walk (see
* Documentation/filesystems/path-lookup.txt). In situations when we can't
* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
- * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * normal reference counts on dentries and vfsmounts to transition to ref-walk
* mode. Refcounts are grabbed at the last known good point before rcu-walk
* got stuck, so ref-walk may continue from there. If this is not successful
* (eg. a seqcount has changed), then failure is returned and it's up to caller
@@ -807,19 +802,19 @@ static int complete_walk(struct nameidata *nd)
static void set_root(struct nameidata *nd)
{
- get_fs_root(current->fs, &nd->root);
-}
-
-static void set_root_rcu(struct nameidata *nd)
-{
struct fs_struct *fs = current->fs;
- unsigned seq;
- do {
- seq = read_seqcount_begin(&fs->seq);
- nd->root = fs->root;
- nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ if (nd->flags & LOOKUP_RCU) {
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ nd->root = fs->root;
+ nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
+ } while (read_seqcount_retry(&fs->seq, seq));
+ } else {
+ get_fs_root(fs, &nd->root);
+ }
}
static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -841,8 +836,28 @@ static inline void path_to_nameidata(const struct path *path,
nd->path.dentry = path->dentry;
}
+static int nd_jump_root(struct nameidata *nd)
+{
+ if (nd->flags & LOOKUP_RCU) {
+ struct dentry *d;
+ nd->path = nd->root;
+ d = nd->path.dentry;
+ nd->inode = d->d_inode;
+ nd->seq = nd->root_seq;
+ if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
+ return -ECHILD;
+ } else {
+ path_put(&nd->path);
+ nd->path = nd->root;
+ path_get(&nd->path);
+ nd->inode = nd->path.dentry->d_inode;
+ }
+ nd->flags |= LOOKUP_JUMPED;
+ return 0;
+}
+
/*
- * Helper to directly jump to a known parsed path from ->follow_link,
+ * Helper to directly jump to a known parsed path from ->get_link,
* caller must have taken a reference to path beforehand.
*/
void nd_jump_link(struct path *path)
@@ -858,9 +873,7 @@ void nd_jump_link(struct path *path)
static inline void put_link(struct nameidata *nd)
{
struct saved *last = nd->stack + --nd->depth;
- struct inode *inode = last->inode;
- if (last->cookie && inode->i_op->put_link)
- inode->i_op->put_link(inode, last->cookie);
+ do_delayed_call(&last->done);
if (!(nd->flags & LOOKUP_RCU))
path_put(&last->link);
}
@@ -892,7 +905,7 @@ static inline int may_follow_link(struct nameidata *nd)
return 0;
/* Allowed if owner and follower match. */
- inode = nd->stack[0].inode;
+ inode = nd->link_inode;
if (uid_eq(current_cred()->fsuid, inode->i_uid))
return 0;
@@ -983,7 +996,7 @@ const char *get_link(struct nameidata *nd)
{
struct saved *last = nd->stack + nd->depth - 1;
struct dentry *dentry = last->link.dentry;
- struct inode *inode = last->inode;
+ struct inode *inode = nd->link_inode;
int error;
const char *res;
@@ -1004,36 +1017,27 @@ const char *get_link(struct nameidata *nd)
nd->last_type = LAST_BIND;
res = inode->i_link;
if (!res) {
+ const char * (*get)(struct dentry *, struct inode *,
+ struct delayed_call *);
+ get = inode->i_op->get_link;
if (nd->flags & LOOKUP_RCU) {
- if (unlikely(unlazy_walk(nd, NULL, 0)))
- return ERR_PTR(-ECHILD);
+ res = get(NULL, inode, &last->done);
+ if (res == ERR_PTR(-ECHILD)) {
+ if (unlikely(unlazy_walk(nd, NULL, 0)))
+ return ERR_PTR(-ECHILD);
+ res = get(dentry, inode, &last->done);
+ }
+ } else {
+ res = get(dentry, inode, &last->done);
}
- res = inode->i_op->follow_link(dentry, &last->cookie);
- if (IS_ERR_OR_NULL(res)) {
- last->cookie = NULL;
+ if (IS_ERR_OR_NULL(res))
return res;
- }
}
if (*res == '/') {
- if (nd->flags & LOOKUP_RCU) {
- struct dentry *d;
- if (!nd->root.mnt)
- set_root_rcu(nd);
- nd->path = nd->root;
- d = nd->path.dentry;
- nd->inode = d->d_inode;
- nd->seq = nd->root_seq;
- if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
- return ERR_PTR(-ECHILD);
- } else {
- if (!nd->root.mnt)
- set_root(nd);
- path_put(&nd->path);
- nd->path = nd->root;
- path_get(&nd->root);
- nd->inode = nd->path.dentry->d_inode;
- }
- nd->flags |= LOOKUP_JUMPED;
+ if (!nd->root.mnt)
+ set_root(nd);
+ if (unlikely(nd_jump_root(nd)))
+ return ERR_PTR(-ECHILD);
while (unlikely(*++res == '/'))
;
}
@@ -1216,8 +1220,8 @@ static int follow_managed(struct path *path, struct nameidata *nd)
if (need_mntput && path->mnt == mnt)
mntput(path->mnt);
- if (ret == -EISDIR)
- ret = 0;
+ if (ret == -EISDIR || !ret)
+ ret = 1;
if (need_mntput)
nd->flags |= LOOKUP_JUMPED;
if (unlikely(ret < 0))
@@ -1294,8 +1298,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
static int follow_dotdot_rcu(struct nameidata *nd)
{
struct inode *inode = nd->inode;
- if (!nd->root.mnt)
- set_root_rcu(nd);
while (1) {
if (path_equal(&nd->path, &nd->root))
@@ -1415,9 +1417,6 @@ static void follow_mount(struct path *path)
static int follow_dotdot(struct nameidata *nd)
{
- if (!nd->root.mnt)
- set_root(nd);
-
while(1) {
struct dentry *old = nd->path.dentry;
@@ -1445,40 +1444,26 @@ static int follow_dotdot(struct nameidata *nd)
* This looks up the name in dcache, possibly revalidates the old dentry and
* allocates a new one if not found or not valid. In the need_lookup argument
* returns whether i_op->lookup is necessary.
- *
- * dir->d_inode->i_mutex must be held
*/
-static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
- unsigned int flags, bool *need_lookup)
+static struct dentry *lookup_dcache(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
{
struct dentry *dentry;
int error;
- *need_lookup = false;
dentry = d_lookup(dir, name);
if (dentry) {
if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
- if (error < 0) {
- dput(dentry);
- return ERR_PTR(error);
- } else {
+ if (!error)
d_invalidate(dentry);
- dput(dentry);
- dentry = NULL;
- }
+ dput(dentry);
+ return ERR_PTR(error);
}
}
}
-
- if (!dentry) {
- dentry = d_alloc(dir, name);
- if (unlikely(!dentry))
- return ERR_PTR(-ENOMEM);
-
- *need_lookup = true;
- }
return dentry;
}
@@ -1507,45 +1492,44 @@ static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
return dentry;
}
-static struct dentry *__lookup_hash(struct qstr *name,
+static struct dentry *__lookup_hash(const struct qstr *name,
struct dentry *base, unsigned int flags)
{
- bool need_lookup;
- struct dentry *dentry;
+ struct dentry *dentry = lookup_dcache(name, base, flags);
- dentry = lookup_dcache(name, base, flags, &need_lookup);
- if (!need_lookup)
+ if (dentry)
return dentry;
+ dentry = d_alloc(base, name);
+ if (unlikely(!dentry))
+ return ERR_PTR(-ENOMEM);
+
return lookup_real(base->d_inode, dentry, flags);
}
-/*
- * It's more convoluted than I'd like it to be, but... it's still fairly
- * small and for now I'd prefer to have fast path as straight as possible.
- * It _is_ time-critical.
- */
static int lookup_fast(struct nameidata *nd,
struct path *path, struct inode **inode,
unsigned *seqp)
{
struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
- int need_reval = 1;
int status = 1;
int err;
/*
* Rename seqlock is not required here because in the off chance
- * of a false negative due to a concurrent rename, we're going to
- * do the non-racy lookup, below.
+ * of a false negative due to a concurrent rename, the caller is
+ * going to fall back to non-racy lookup.
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
bool negative;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
- if (!dentry)
- goto unlazy;
+ if (unlikely(!dentry)) {
+ if (unlazy_walk(nd, NULL, 0))
+ return -ECHILD;
+ return 0;
+ }
/*
* This sequence count validates that the inode matches
@@ -1553,7 +1537,7 @@ static int lookup_fast(struct nameidata *nd,
*/
*inode = d_backing_inode(dentry);
negative = d_is_negative(dentry);
- if (read_seqcount_retry(&dentry->d_seq, seq))
+ if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
return -ECHILD;
/*
@@ -1563,81 +1547,89 @@ static int lookup_fast(struct nameidata *nd,
* The memory barrier in read_seqcount_begin of child is
* enough, we can use __read_seqcount_retry here.
*/
- if (__read_seqcount_retry(&parent->d_seq, nd->seq))
+ if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
return -ECHILD;
*seqp = seq;
- if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+ if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
status = d_revalidate(dentry, nd->flags);
- if (unlikely(status <= 0)) {
- if (status != -ECHILD)
- need_reval = 0;
- goto unlazy;
- }
+ if (unlikely(status <= 0)) {
+ if (unlazy_walk(nd, dentry, seq))
+ return -ECHILD;
+ if (status == -ECHILD)
+ status = d_revalidate(dentry, nd->flags);
+ } else {
+ /*
+ * Note: do negative dentry check after revalidation in
+ * case that drops it.
+ */
+ if (unlikely(negative))
+ return -ENOENT;
+ path->mnt = mnt;
+ path->dentry = dentry;
+ if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
+ return 1;
+ if (unlazy_walk(nd, dentry, seq))
+ return -ECHILD;
}
- /*
- * Note: do negative dentry check after revalidation in
- * case that drops it.
- */
- if (negative)
- return -ENOENT;
- path->mnt = mnt;
- path->dentry = dentry;
- if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
- return 0;
-unlazy:
- if (unlazy_walk(nd, dentry, seq))
- return -ECHILD;
} else {
dentry = __d_lookup(parent, &nd->last);
+ if (unlikely(!dentry))
+ return 0;
+ if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
+ status = d_revalidate(dentry, nd->flags);
}
-
- if (unlikely(!dentry))
- goto need_lookup;
-
- if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
- status = d_revalidate(dentry, nd->flags);
if (unlikely(status <= 0)) {
- if (status < 0) {
- dput(dentry);
- return status;
- }
- d_invalidate(dentry);
+ if (!status)
+ d_invalidate(dentry);
dput(dentry);
- goto need_lookup;
+ return status;
}
-
if (unlikely(d_is_negative(dentry))) {
dput(dentry);
return -ENOENT;
}
+
path->mnt = mnt;
path->dentry = dentry;
err = follow_managed(path, nd);
- if (likely(!err))
+ if (likely(err > 0))
*inode = d_backing_inode(path->dentry);
return err;
-
-need_lookup:
- return 1;
}
/* Fast lookup failed, do it the slow way */
-static int lookup_slow(struct nameidata *nd, struct path *path)
+static struct dentry *lookup_slow(const struct qstr *name,
+ struct dentry *dir,
+ unsigned int flags)
{
- struct dentry *dentry, *parent;
-
- parent = nd->path.dentry;
- BUG_ON(nd->inode != parent->d_inode);
-
- mutex_lock(&parent->d_inode->i_mutex);
- dentry = __lookup_hash(&nd->last, parent, nd->flags);
- mutex_unlock(&parent->d_inode->i_mutex);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- path->mnt = nd->path.mnt;
- path->dentry = dentry;
- return follow_managed(path, nd);
+ struct dentry *dentry;
+ inode_lock(dir->d_inode);
+ dentry = d_lookup(dir, name);
+ if (unlikely(dentry)) {
+ if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
+ !(flags & LOOKUP_NO_REVAL)) {
+ int error = d_revalidate(dentry, flags);
+ if (unlikely(error <= 0)) {
+ if (!error)
+ d_invalidate(dentry);
+ dput(dentry);
+ dentry = ERR_PTR(error);
+ }
+ }
+ if (dentry) {
+ inode_unlock(dir->d_inode);
+ return dentry;
+ }
+ }
+ dentry = d_alloc(dir, name);
+ if (unlikely(!dentry)) {
+ inode_unlock(dir->d_inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ dentry = lookup_real(dir->d_inode, dentry, flags);
+ inode_unlock(dir->d_inode);
+ return dentry;
}
static inline int may_lookup(struct nameidata *nd)
@@ -1655,6 +1647,8 @@ static inline int may_lookup(struct nameidata *nd)
static inline int handle_dots(struct nameidata *nd, int type)
{
if (type == LAST_DOTDOT) {
+ if (!nd->root.mnt)
+ set_root(nd);
if (nd->flags & LOOKUP_RCU) {
return follow_dotdot_rcu(nd);
} else
@@ -1691,8 +1685,8 @@ static int pick_link(struct nameidata *nd, struct path *link,
last = nd->stack + nd->depth++;
last->link = *link;
- last->cookie = NULL;
- last->inode = inode;
+ clear_delayed_call(&last->done);
+ nd->link_inode = inode;
last->seq = seq;
return 1;
}
@@ -1711,6 +1705,11 @@ static inline int should_follow_link(struct nameidata *nd, struct path *link,
return 0;
if (!follow)
return 0;
+ /* make sure that d_is_symlink above matches inode */
+ if (nd->flags & LOOKUP_RCU) {
+ if (read_seqcount_retry(&link->dentry->d_seq, seq))
+ return -ECHILD;
+ }
return pick_link(nd, link, inode, seq);
}
@@ -1734,19 +1733,26 @@ static int walk_component(struct nameidata *nd, int flags)
return err;
}
err = lookup_fast(nd, &path, &inode, &seq);
- if (unlikely(err)) {
+ if (unlikely(err <= 0)) {
if (err < 0)
return err;
-
- err = lookup_slow(nd, &path);
- if (err < 0)
+ path.dentry = lookup_slow(&nd->last, nd->path.dentry,
+ nd->flags);
+ if (IS_ERR(path.dentry))
+ return PTR_ERR(path.dentry);
+
+ path.mnt = nd->path.mnt;
+ err = follow_managed(&path, nd);
+ if (unlikely(err < 0))
return err;
- inode = d_backing_inode(path.dentry);
+ if (unlikely(d_is_negative(path.dentry))) {
+ path_to_nameidata(&path, nd);
+ return -ENOENT;
+ }
+
seq = 0; /* we are already out of RCU mode */
- err = -ENOENT;
- if (d_is_negative(path.dentry))
- goto out_path_put;
+ inode = d_backing_inode(path.dentry);
}
if (flags & WALK_PUT)
@@ -1758,10 +1764,6 @@ static int walk_component(struct nameidata *nd, int flags)
nd->inode = inode;
nd->seq = seq;
return 0;
-
-out_path_put:
- path_to_nameidata(&path, nd);
- return err;
}
/*
@@ -1996,7 +1998,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
nd->depth = 0;
- nd->total_link_count = 0;
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
@@ -2021,18 +2022,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
nd->root.mnt = NULL;
+ nd->path.mnt = NULL;
+ nd->path.dentry = NULL;
nd->m_seq = read_seqbegin(&mount_lock);
if (*s == '/') {
- if (flags & LOOKUP_RCU) {
+ if (flags & LOOKUP_RCU)
rcu_read_lock();
- set_root_rcu(nd);
- nd->seq = nd->root_seq;
- } else {
- set_root(nd);
- path_get(&nd->root);
- }
- nd->path = nd->root;
+ set_root(nd);
+ if (likely(!nd_jump_root(nd)))
+ return s;
+ nd->root.mnt = NULL;
+ rcu_read_unlock();
+ return ERR_PTR(-ECHILD);
} else if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
@@ -2043,11 +2045,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
+ nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
+ nd->inode = nd->path.dentry->d_inode;
}
+ return s;
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(nd->dfd);
@@ -2077,16 +2082,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
fdput(f);
return s;
}
-
- nd->inode = nd->path.dentry->d_inode;
- if (!(flags & LOOKUP_RCU))
- return s;
- if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
- return s;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
- return ERR_PTR(-ECHILD);
}
static const char *trailing_symlink(struct nameidata *nd)
@@ -2235,10 +2230,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
putname(filename);
return ERR_PTR(-EINVAL);
}
- mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
d = __lookup_hash(&last, path->dentry, 0);
if (IS_ERR(d)) {
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
path_put(path);
}
putname(filename);
@@ -2279,6 +2274,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
*
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2286,7 +2283,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
unsigned int c;
int err;
- WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(base->d_inode));
this.name = name;
this.len = len;
@@ -2322,6 +2319,63 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
}
EXPORT_SYMBOL(lookup_one_len);
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name: pathname component to lookup
+ * @base: base directory to lookup from
+ * @len: maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct qstr this;
+ unsigned int c;
+ int err;
+ struct dentry *ret;
+
+ this.name = name;
+ this.len = len;
+ this.hash = full_name_hash(name, len);
+ if (!len)
+ return ERR_PTR(-EACCES);
+
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ return ERR_PTR(-EACCES);
+ }
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash..
+ */
+ if (base->d_flags & DCACHE_OP_HASH) {
+ int err = base->d_op->d_hash(base, &this);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ err = inode_permission(base->d_inode, MAY_EXEC);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = lookup_dcache(&this, base, 0);
+ if (!ret)
+ ret = lookup_slow(&this, base, 0);
+ return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
{
@@ -2395,31 +2449,21 @@ mountpoint_last(struct nameidata *nd, struct path *path)
if (error)
return error;
dentry = dget(nd->path.dentry);
- goto done;
- }
-
- mutex_lock(&dir->d_inode->i_mutex);
- dentry = d_lookup(dir, &nd->last);
- if (!dentry) {
- /*
- * No cached dentry. Mounted dentries are pinned in the cache,
- * so that means that this dentry is probably a symlink or the
- * path doesn't actually point to a mounted dentry.
- */
- dentry = d_alloc(dir, &nd->last);
+ } else {
+ dentry = d_lookup(dir, &nd->last);
if (!dentry) {
- mutex_unlock(&dir->d_inode->i_mutex);
- return -ENOMEM;
- }
- dentry = lookup_real(dir->d_inode, dentry, nd->flags);
- if (IS_ERR(dentry)) {
- mutex_unlock(&dir->d_inode->i_mutex);
- return PTR_ERR(dentry);
+ /*
+ * No cached dentry. Mounted dentries are pinned in the
+ * cache, so that means that this dentry is probably
+ * a symlink or the path doesn't actually point
+ * to a mounted dentry.
+ */
+ dentry = lookup_slow(&nd->last, dir,
+ nd->flags | LOOKUP_NO_REVAL);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
}
}
- mutex_unlock(&dir->d_inode->i_mutex);
-
-done:
if (d_is_negative(dentry)) {
dput(dentry);
return -ENOENT;
@@ -2607,7 +2651,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
struct dentry *p;
if (p1 == p2) {
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
return NULL;
}
@@ -2615,29 +2659,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
p = d_ancestor(p2, p1);
if (p) {
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
return p;
}
p = d_ancestor(p1, p2);
if (p) {
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
return p;
}
- mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
return NULL;
}
EXPORT_SYMBOL(lock_rename);
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
- mutex_unlock(&p1->d_inode->i_mutex);
+ inode_unlock(p1->d_inode);
if (p1 != p2) {
- mutex_unlock(&p2->d_inode->i_mutex);
+ inode_unlock(p2->d_inode);
mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
}
}
@@ -2670,10 +2714,6 @@ static int may_open(struct path *path, int acc_mode, int flag)
struct inode *inode = dentry->d_inode;
int error;
- /* O_PATH? */
- if (!acc_mode)
- return 0;
-
if (!inode)
return -ENOENT;
@@ -2695,7 +2735,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
}
- error = inode_permission(inode, acc_mode);
+ error = inode_permission(inode, MAY_OPEN | acc_mode);
if (error)
return error;
@@ -2887,7 +2927,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
if (*opened & FILE_CREATED) {
WARN_ON(!(open_flag & O_CREAT));
fsnotify_create(dir, dentry);
- acc_mode = MAY_OPEN;
+ acc_mode = 0;
}
error = may_open(&file->f_path, acc_mode, open_flag);
if (error)
@@ -2952,16 +2992,22 @@ static int lookup_open(struct nameidata *nd, struct path *path,
struct inode *dir_inode = dir->d_inode;
struct dentry *dentry;
int error;
- bool need_lookup;
+ bool need_lookup = false;
*opened &= ~FILE_CREATED;
- dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
+ dentry = lookup_dcache(&nd->last, dir, nd->flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- /* Cached positive dentry: will open in f_op->open */
- if (!need_lookup && dentry->d_inode)
+ if (!dentry) {
+ dentry = d_alloc(dir, &nd->last);
+ if (unlikely(!dentry))
+ return -ENOMEM;
+ need_lookup = true;
+ } else if (dentry->d_inode) {
+ /* Cached positive dentry: will open in f_op->open */
goto out_no_open;
+ }
if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
return atomic_open(nd, dentry, path, file, op, got_write,
@@ -3045,13 +3091,14 @@ static int do_last(struct nameidata *nd,
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
error = lookup_fast(nd, &path, &inode, &seq);
- if (likely(!error))
+ if (likely(error > 0))
goto finish_lookup;
if (error < 0)
return error;
BUG_ON(nd->inode != dir->d_inode);
+ BUG_ON(nd->flags & LOOKUP_RCU);
} else {
/* create side of things */
/*
@@ -3080,9 +3127,9 @@ retry_lookup:
* dropping this one anyway.
*/
}
- mutex_lock(&dir->d_inode->i_mutex);
+ inode_lock(dir->d_inode);
error = lookup_open(nd, &path, file, op, got_write, opened);
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
if (error <= 0) {
if (error)
@@ -3100,18 +3147,12 @@ retry_lookup:
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
will_truncate = false;
- acc_mode = MAY_OPEN;
+ acc_mode = 0;
path_to_nameidata(&path, nd);
goto finish_open_created;
}
/*
- * create/update audit record if it already exists.
- */
- if (d_is_positive(path.dentry))
- audit_inode(nd->name, path.dentry, 0);
-
- /*
* If atomic_open() acquired write access it is dropped now due to
* possible mount and symlink following (this might be optimized away if
* necessary...)
@@ -3121,6 +3162,16 @@ retry_lookup:
got_write = false;
}
+ if (unlikely(d_is_negative(path.dentry))) {
+ path_to_nameidata(&path, nd);
+ return -ENOENT;
+ }
+
+ /*
+ * create/update audit record if it already exists.
+ */
+ audit_inode(nd->name, path.dentry, 0);
+
if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
path_to_nameidata(&path, nd);
return -EEXIST;
@@ -3130,13 +3181,8 @@ retry_lookup:
if (unlikely(error < 0))
return error;
- BUG_ON(nd->flags & LOOKUP_RCU);
- inode = d_backing_inode(path.dentry);
seq = 0; /* out of RCU mode, so the value doesn't matter */
- if (unlikely(d_is_negative(path.dentry))) {
- path_to_nameidata(&path, nd);
- return -ENOENT;
- }
+ inode = d_backing_inode(path.dentry);
finish_lookup:
if (nd->depth)
put_link(nd);
@@ -3145,11 +3191,6 @@ finish_lookup:
if (unlikely(error))
return error;
- if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) {
- path_to_nameidata(&path, nd);
- return -ELOOP;
- }
-
if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) {
path_to_nameidata(&path, nd);
} else {
@@ -3168,6 +3209,10 @@ finish_open:
return error;
}
audit_inode(nd->name, nd->path.dentry, 0);
+ if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
+ error = -ELOOP;
+ goto out;
+ }
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
@@ -3184,10 +3229,11 @@ finish_open:
got_write = true;
}
finish_open_created:
- error = may_open(&nd->path, acc_mode, open_flag);
- if (error)
- goto out;
-
+ if (likely(!(open_flag & O_PATH))) {
+ error = may_open(&nd->path, acc_mode, open_flag);
+ if (error)
+ goto out;
+ }
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
if (!error) {
@@ -3211,6 +3257,10 @@ opened:
goto exit_fput;
}
out:
+ if (unlikely(error > 0)) {
+ WARN_ON(1);
+ error = -EINVAL;
+ }
if (got_write)
mnt_drop_write(nd->path.mnt);
path_put(&save_parent);
@@ -3274,7 +3324,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
goto out2;
audit_inode(nd->name, child, 0);
/* Don't check for other permissions, the inode was just created */
- error = may_open(&path, MAY_OPEN, op->open_flag);
+ error = may_open(&path, 0, op->open_flag);
if (error)
goto out2;
file->f_path.mnt = path.mnt;
@@ -3427,7 +3477,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* Do the final lookup.
*/
lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
- mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path->dentry, lookup_flags);
if (IS_ERR(dentry))
goto unlock;
@@ -3456,7 +3506,7 @@ fail:
dput(dentry);
dentry = ERR_PTR(error);
unlock:
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
if (!err2)
mnt_drop_write(path->mnt);
out:
@@ -3476,7 +3526,7 @@ EXPORT_SYMBOL(kern_path_create);
void done_path_create(struct path *path, struct dentry *dentry)
{
dput(dentry);
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
mnt_drop_write(path->mnt);
path_put(path);
}
@@ -3637,31 +3687,6 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
return sys_mkdirat(AT_FDCWD, pathname, mode);
}
-/*
- * The dentry_unhash() helper will try to drop the dentry early: we
- * should have a usage count of 1 if we're the only user of this
- * dentry, and if that is true (possibly after pruning the dcache),
- * then we drop the dentry now.
- *
- * A low-level filesystem can, if it choses, legally
- * do a
- *
- * if (!d_unhashed(dentry))
- * return -EBUSY;
- *
- * if it cannot handle the case of removing a directory
- * that is still in use by something else..
- */
-void dentry_unhash(struct dentry *dentry)
-{
- shrink_dcache_parent(dentry);
- spin_lock(&dentry->d_lock);
- if (dentry->d_lockref.count == 1)
- __d_drop(dentry);
- spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(dentry_unhash);
-
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
int error = may_delete(dir, dentry, 1);
@@ -3673,7 +3698,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
return -EPERM;
dget(dentry);
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
error = -EBUSY;
if (is_local_mountpoint(dentry))
@@ -3693,7 +3718,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
detach_mounts(dentry);
out:
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
dput(dentry);
if (!error)
d_delete(dentry);
@@ -3732,7 +3757,7 @@ retry:
if (error)
goto exit1;
- mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
@@ -3748,7 +3773,7 @@ retry:
exit3:
dput(dentry);
exit2:
- mutex_unlock(&path.dentry->d_inode->i_mutex);
+ inode_unlock(path.dentry->d_inode);
mnt_drop_write(path.mnt);
exit1:
path_put(&path);
@@ -3794,7 +3819,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
if (!dir->i_op->unlink)
return -EPERM;
- mutex_lock(&target->i_mutex);
+ inode_lock(target);
if (is_local_mountpoint(dentry))
error = -EBUSY;
else {
@@ -3811,7 +3836,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
}
}
out:
- mutex_unlock(&target->i_mutex);
+ inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
@@ -3854,7 +3879,7 @@ retry:
if (error)
goto exit1;
retry_deleg:
- mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
dentry = __lookup_hash(&last, path.dentry, lookup_flags);
error = PTR_ERR(dentry);
if (!IS_ERR(dentry)) {
@@ -3872,7 +3897,7 @@ retry_deleg:
exit2:
dput(dentry);
}
- mutex_unlock(&path.dentry->d_inode->i_mutex);
+ inode_unlock(path.dentry->d_inode);
if (inode)
iput(inode); /* truncate the inode here */
inode = NULL;
@@ -4024,7 +4049,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Make sure we don't allow creating hardlink to an unlinked file */
if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
error = -ENOENT;
@@ -4041,7 +4066,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
inode->i_state &= ~I_LINKABLE;
spin_unlock(&inode->i_lock);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!error)
fsnotify_link(dir, inode, new_dentry);
return error;
@@ -4241,7 +4266,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!is_dir || (flags & RENAME_EXCHANGE))
lock_two_nondirectories(source, target);
else if (target)
- mutex_lock(&target->i_mutex);
+ inode_lock(target);
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
@@ -4294,7 +4319,7 @@ out:
if (!is_dir || (flags & RENAME_EXCHANGE))
unlock_two_nondirectories(source, target);
else if (target)
- mutex_unlock(&target->i_mutex);
+ inode_unlock(target);
dput(new_dentry);
if (!error) {
fsnotify_move(old_dir, new_dir, old_name, is_dir,
@@ -4496,72 +4521,73 @@ EXPORT_SYMBOL(readlink_copy);
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
- * have ->follow_link() touching nd only in nd_set_link(). Using (or not
- * using) it for any given inode is up to filesystem.
+ * have ->get_link() not calling nd_jump_link(). Using (or not using) it
+ * for any given inode is up to filesystem.
*/
int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- void *cookie;
+ DEFINE_DELAYED_CALL(done);
struct inode *inode = d_inode(dentry);
const char *link = inode->i_link;
int res;
if (!link) {
- link = inode->i_op->follow_link(dentry, &cookie);
+ link = inode->i_op->get_link(dentry, inode, &done);
if (IS_ERR(link))
return PTR_ERR(link);
}
res = readlink_copy(buffer, buflen, link);
- if (inode->i_op->put_link)
- inode->i_op->put_link(inode, cookie);
+ do_delayed_call(&done);
return res;
}
EXPORT_SYMBOL(generic_readlink);
/* get the link contents into pagecache */
-static char *page_getlink(struct dentry * dentry, struct page **ppage)
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
char *kaddr;
struct page *page;
- struct address_space *mapping = dentry->d_inode->i_mapping;
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
- return (char*)page;
- *ppage = page;
- kaddr = kmap(page);
- nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+ struct address_space *mapping = inode->i_mapping;
+
+ if (!dentry) {
+ page = find_get_page(mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ page = read_mapping_page(mapping, 0, NULL);
+ if (IS_ERR(page))
+ return (char*)page;
+ }
+ set_delayed_call(callback, page_put_link, page);
+ BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
+ kaddr = page_address(page);
+ nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
return kaddr;
}
-int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-{
- struct page *page = NULL;
- int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
- if (page) {
- kunmap(page);
- page_cache_release(page);
- }
- return res;
-}
-EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_get_link);
-const char *page_follow_link_light(struct dentry *dentry, void **cookie)
+void page_put_link(void *arg)
{
- struct page *page = NULL;
- char *res = page_getlink(dentry, &page);
- if (!IS_ERR(res))
- *cookie = page;
- return res;
+ put_page(arg);
}
-EXPORT_SYMBOL(page_follow_link_light);
+EXPORT_SYMBOL(page_put_link);
-void page_put_link(struct inode *unused, void *cookie)
+int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
- struct page *page = cookie;
- kunmap(page);
- page_cache_release(page);
+ DEFINE_DELAYED_CALL(done);
+ int res = readlink_copy(buffer, buflen,
+ page_get_link(dentry, d_inode(dentry),
+ &done));
+ do_delayed_call(&done);
+ return res;
}
-EXPORT_SYMBOL(page_put_link);
+EXPORT_SYMBOL(page_readlink);
/*
* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4572,7 +4598,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
struct page *page;
void *fsdata;
int err;
- char *kaddr;
unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
if (nofs)
flags |= AOP_FLAG_NOFS;
@@ -4583,9 +4608,7 @@ retry:
if (err)
goto fail;
- kaddr = kmap_atomic(page);
- memcpy(kaddr, symname, len-1);
- kunmap_atomic(kaddr);
+ memcpy(page_address(page), symname, len-1);
err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
page, fsdata);
@@ -4604,13 +4627,12 @@ EXPORT_SYMBOL(__page_symlink);
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
- !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+ !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
}
EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);
diff --git a/fs/namespace.c b/fs/namespace.c
index 0570729c87fd..4fb1691b4355 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1584,6 +1584,14 @@ static inline bool may_mount(void)
return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
}
+static inline bool may_mandlock(void)
+{
+#ifndef CONFIG_MANDATORY_FILE_LOCKING
+ return false;
+#endif
+ return capable(CAP_SYS_ADMIN);
+}
+
/*
* Now umount can handle mount points as well as block devices.
* This is important for filesystems which use unnamed block devices.
@@ -1953,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path)
struct vfsmount *mnt;
struct dentry *dentry = path->dentry;
retry:
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
if (unlikely(cant_mount(dentry))) {
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
return ERR_PTR(-ENOENT);
}
namespace_lock();
@@ -1966,13 +1974,13 @@ retry:
mp = new_mountpoint(dentry);
if (IS_ERR(mp)) {
namespace_unlock();
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
return mp;
}
return mp;
}
namespace_unlock();
- mutex_unlock(&path->dentry->d_inode->i_mutex);
+ inode_unlock(path->dentry->d_inode);
path_put(path);
path->mnt = mnt;
dentry = path->dentry = dget(mnt->mnt_root);
@@ -1984,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where)
struct dentry *dentry = where->m_dentry;
put_mountpoint(where);
namespace_unlock();
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
@@ -2601,18 +2609,18 @@ static long exact_copy_from_user(void *to, const void __user * from,
return n;
}
-int copy_mount_options(const void __user * data, unsigned long *where)
+void *copy_mount_options(const void __user * data)
{
int i;
- unsigned long page;
unsigned long size;
+ char *copy;
- *where = 0;
if (!data)
- return 0;
+ return NULL;
- if (!(page = __get_free_page(GFP_KERNEL)))
- return -ENOMEM;
+ copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!copy)
+ return ERR_PTR(-ENOMEM);
/* We only care that *some* data at the address the user
* gave us is valid. Just in case, we'll zero
@@ -2623,15 +2631,14 @@ int copy_mount_options(const void __user * data, unsigned long *where)
if (size > PAGE_SIZE)
size = PAGE_SIZE;
- i = size - exact_copy_from_user((void *)page, data, size);
+ i = size - exact_copy_from_user(copy, data, size);
if (!i) {
- free_page(page);
- return -EFAULT;
+ kfree(copy);
+ return ERR_PTR(-EFAULT);
}
if (i != PAGE_SIZE)
- memset((char *)page + i, 0, PAGE_SIZE - i);
- *where = page;
- return 0;
+ memset(copy + i, 0, PAGE_SIZE - i);
+ return copy;
}
char *copy_mount_string(const void __user *data)
@@ -2677,6 +2684,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
type_page, flags, data_page);
if (!retval && !may_mount())
retval = -EPERM;
+ if (!retval && (flags & MS_MANDLOCK) && !may_mandlock())
+ retval = -EPERM;
if (retval)
goto dput_out;
@@ -2896,7 +2905,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
int ret;
char *kernel_type;
char *kernel_dev;
- unsigned long data_page;
+ void *options;
kernel_type = copy_mount_string(type);
ret = PTR_ERR(kernel_type);
@@ -2908,14 +2917,14 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
if (IS_ERR(kernel_dev))
goto out_dev;
- ret = copy_mount_options(data, &data_page);
- if (ret < 0)
+ options = copy_mount_options(data);
+ ret = PTR_ERR(options);
+ if (IS_ERR(options))
goto out_data;
- ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
- (void *) data_page);
+ ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
- free_page(data_page);
+ kfree(options);
out_data:
kfree(kernel_dev);
out_dev:
@@ -2939,9 +2948,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}
-int path_is_under(struct path *path1, struct path *path2)
+bool path_is_under(struct path *path1, struct path *path2)
{
- int res;
+ bool res;
read_seqlock_excl(&mount_lock);
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
read_sequnlock_excl(&mount_lock);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 356816e7bc90..bfdad003ee56 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
if (!res) {
struct inode *inode = d_inode(dentry);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
ncp_new_dentry(dentry);
val=1;
@@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
ncp_dbg(2, "found, but dirEntNum changed\n");
ncp_update_inode2(inode, &finfo);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
finished:
@@ -510,7 +510,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
kunmap(ctl.page);
SetPageUptodate(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
ctl.page = NULL;
}
ctl.idx = 0;
@@ -520,7 +520,7 @@ invalid_cache:
if (ctl.page) {
kunmap(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
ctl.page = NULL;
}
ctl.cache = cache;
@@ -554,14 +554,14 @@ finished:
kunmap(ctl.page);
SetPageUptodate(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
}
if (page) {
cache->head = ctl.head;
kunmap(page);
SetPageUptodate(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
out:
return result;
@@ -633,15 +633,15 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_rehash(newdent);
} else {
spin_lock(&dentry->d_lock);
- NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
spin_unlock(&dentry->d_lock);
}
} else {
struct inode *inode = d_inode(newdent);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(inode, I_MUTEX_CHILD);
ncp_update_inode2(inode, entry);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
if (ctl.idx >= NCP_DIRCACHE_SIZE) {
@@ -649,7 +649,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
kunmap(ctl.page);
SetPageUptodate(ctl.page);
unlock_page(ctl.page);
- page_cache_release(ctl.page);
+ put_page(ctl.page);
}
ctl.cache = NULL;
ctl.idx -= NCP_DIRCACHE_SIZE;
@@ -1165,8 +1165,6 @@ out:
static int ncp_mknod(struct inode * dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- if (!new_valid_dev(rdev))
- return -EINVAL;
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
ncp_dbg(1, "mode = 0%ho\n", mode);
return ncp_create_new(dir, dentry, mode, rdev, 0);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 011324ce9df2..dd38ca1f2ecb 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iocb->ki_pos = pos;
if (pos > i_size_read(inode)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (pos > i_size_read(inode))
i_size_write(inode, pos);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
ncp_dbg(1, "exit %pD2\n", file);
outrel:
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9605a2f63549..1af15fcbe57b 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -82,7 +82,7 @@ static int init_inodecache(void)
ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
sizeof(struct ncp_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ncp_inode_cachep == NULL)
return -ENOMEM;
@@ -244,8 +244,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
static const struct inode_operations ncp_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = ncp_notify_change,
};
#endif
@@ -283,6 +282,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &ncp_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &ncp_symlink_aops;
#endif
} else {
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 79b113048eac..0a3f9b594602 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -525,6 +525,8 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
switch (rqdata.cmd) {
case NCP_LOCK_EX:
case NCP_LOCK_SH:
+ if (rqdata.timeout < 0)
+ return -EINVAL;
if (rqdata.timeout == 0)
rqdata.timeout = NCP_LOCK_DEFAULT_TIMEOUT;
else if (rqdata.timeout > NCP_LOCK_MAX_TIMEOUT)
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 5233fbc1747a..17cfb743b5bf 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -191,7 +191,7 @@ struct ncp_cache_head {
int eof;
};
-#define NCP_DIRCACHE_SIZE ((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *)))
+#define NCP_DIRCACHE_SIZE ((int)(PAGE_SIZE/sizeof(struct dentry *)))
union ncp_dir_cache {
struct ncp_cache_head head;
struct dentry *dentry[NCP_DIRCACHE_SIZE];
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9cd4eb3a1e22..17a42e4eb872 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -229,9 +229,9 @@ bl_read_pagelist(struct nfs_pgio_header *header)
struct parallel_io *par;
loff_t f_offset = header->args.offset;
size_t bytes_left = header->args.count;
- unsigned int pg_offset, pg_len;
+ unsigned int pg_offset = header->args.pgbase, pg_len;
struct page **pages = header->args.pages;
- int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
const bool is_dio = (header->dreq != NULL);
struct blk_plug plug;
int i;
@@ -262,20 +262,16 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length = be.be_length - (isect - be.be_f_offset);
}
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
- if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
- pg_len = PAGE_CACHE_SIZE - pg_offset;
+ if (pg_offset + bytes_left > PAGE_SIZE)
+ pg_len = PAGE_SIZE - pg_offset;
else
pg_len = bytes_left;
} else {
BUG_ON(pg_offset != 0);
- pg_len = PAGE_CACHE_SIZE;
+ pg_len = PAGE_SIZE;
}
- isect += (pg_offset >> SECTOR_SHIFT);
- extent_length -= (pg_offset >> SECTOR_SHIFT);
-
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
@@ -301,6 +297,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length -= (pg_len >> SECTOR_SHIFT);
f_offset += pg_len;
bytes_left -= pg_len;
+ pg_offset = 0;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
@@ -342,9 +339,9 @@ static void bl_write_cleanup(struct work_struct *work)
if (likely(!hdr->pnfs_error)) {
struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
- u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+ u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
u64 end = (hdr->args.offset + hdr->args.count +
- PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+ PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
(end - start) >> SECTOR_SHIFT);
@@ -376,7 +373,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
loff_t offset = header->args.offset;
size_t count = header->args.count;
struct page **pages = header->args.pages;
- int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ int pg_index = header->args.pgbase >> PAGE_SHIFT;
unsigned int pg_len;
struct blk_plug plug;
int i;
@@ -395,7 +392,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
blk_start_plug(&plug);
/* we always write out the whole page */
- offset = offset & (loff_t)PAGE_CACHE_MASK;
+ offset = offset & (loff_t)PAGE_MASK;
isect = offset >> SECTOR_SHIFT;
for (i = pg_index; i < header->page_array.npages; i++) {
@@ -411,7 +408,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
extent_length = be.be_length - (isect - be.be_f_offset);
}
- pg_len = PAGE_CACHE_SIZE;
+ pg_len = PAGE_SIZE;
bio = do_add_page_to_bio(bio, header->page_array.npages - i,
WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
@@ -449,8 +446,8 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
kfree(bl);
}
-static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
- gfp_t gfp_flags)
+static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags, bool is_scsi_layout)
{
struct pnfs_block_layout *bl;
@@ -463,9 +460,22 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
bl->bl_ext_ro = RB_ROOT;
spin_lock_init(&bl->bl_ext_lock);
+ bl->bl_scsi_layout = is_scsi_layout;
return &bl->bl_layout;
}
+static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, false);
+}
+
+static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
+ gfp_t gfp_flags)
+{
+ return __bl_alloc_layout_hdr(inode, gfp_flags, true);
+}
+
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
dprintk("%s enter\n", __func__);
@@ -746,7 +756,7 @@ bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
static bool
is_aligned_req(struct nfs_pageio_descriptor *pgio,
- struct nfs_page *req, unsigned int alignment)
+ struct nfs_page *req, unsigned int alignment, bool is_write)
{
/*
* Always accept buffered writes, higher layers take care of the
@@ -761,7 +771,8 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
if (IS_ALIGNED(req->wb_bytes, alignment))
return true;
- if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
+ if (is_write &&
+ (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode))) {
/*
* If the write goes up to the inode size, just write
* the full page. Data past the inode size is
@@ -778,7 +789,7 @@ is_aligned_req(struct nfs_pageio_descriptor *pgio,
static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
- if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
nfs_pageio_reset_read_mds(pgio);
return;
}
@@ -794,7 +805,7 @@ static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (!is_aligned_req(pgio, req, SECTOR_SIZE))
+ if (!is_aligned_req(pgio, req, SECTOR_SIZE, false))
return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -809,7 +820,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
pgoff_t end;
/* Optimize common case that writes from 0 to end of file */
- end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+ end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (end != inode->i_mapping->nrpages) {
rcu_read_lock();
end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
@@ -817,9 +828,9 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
}
if (!end)
- return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
+ return i_size_read(inode) - (idx << PAGE_SHIFT);
else
- return (end - idx) << PAGE_CACHE_SHIFT;
+ return (end - idx) << PAGE_SHIFT;
}
static void
@@ -827,7 +838,7 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 wb_size;
- if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
+ if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
nfs_pageio_reset_write_mds(pgio);
return;
}
@@ -849,7 +860,7 @@ static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (!is_aligned_req(pgio, req, PAGE_SIZE))
+ if (!is_aligned_req(pgio, req, PAGE_SIZE, true))
return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -891,22 +902,53 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
.sync = pnfs_generic_sync,
};
+static struct pnfs_layoutdriver_type scsilayout_type = {
+ .id = LAYOUT_SCSI,
+ .name = "LAYOUT_SCSI",
+ .owner = THIS_MODULE,
+ .flags = PNFS_LAYOUTRET_ON_SETATTR |
+ PNFS_READ_WHOLE_PAGE,
+ .read_pagelist = bl_read_pagelist,
+ .write_pagelist = bl_write_pagelist,
+ .alloc_layout_hdr = sl_alloc_layout_hdr,
+ .free_layout_hdr = bl_free_layout_hdr,
+ .alloc_lseg = bl_alloc_lseg,
+ .free_lseg = bl_free_lseg,
+ .return_range = bl_return_range,
+ .prepare_layoutcommit = bl_prepare_layoutcommit,
+ .cleanup_layoutcommit = bl_cleanup_layoutcommit,
+ .set_layoutdriver = bl_set_layoutdriver,
+ .alloc_deviceid_node = bl_alloc_deviceid_node,
+ .free_deviceid_node = bl_free_deviceid_node,
+ .pg_read_ops = &bl_pg_read_ops,
+ .pg_write_ops = &bl_pg_write_ops,
+ .sync = pnfs_generic_sync,
+};
+
+
static int __init nfs4blocklayout_init(void)
{
int ret;
dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
- ret = pnfs_register_layoutdriver(&blocklayout_type);
+ ret = bl_init_pipefs();
if (ret)
goto out;
- ret = bl_init_pipefs();
+
+ ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
- goto out_unregister;
+ goto out_cleanup_pipe;
+
+ ret = pnfs_register_layoutdriver(&scsilayout_type);
+ if (ret)
+ goto out_unregister_block;
return 0;
-out_unregister:
+out_unregister_block:
pnfs_unregister_layoutdriver(&blocklayout_type);
+out_cleanup_pipe:
+ bl_cleanup_pipefs();
out:
return ret;
}
@@ -916,8 +958,9 @@ static void __exit nfs4blocklayout_exit(void)
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
- bl_cleanup_pipefs();
+ pnfs_unregister_layoutdriver(&scsilayout_type);
pnfs_unregister_layoutdriver(&blocklayout_type);
+ bl_cleanup_pipefs();
}
MODULE_ALIAS("nfs-layouttype4-3");
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index c556640dcf3b..18e6fd0b9506 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -40,8 +40,8 @@
#include "../pnfs.h"
#include "../netns.h"
-#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
-#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define PAGE_CACHE_SECTORS (PAGE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
struct pnfs_block_dev;
@@ -55,7 +55,6 @@ struct pnfs_block_dev;
*/
#define PNFS_BLOCK_UUID_LEN 128
-
struct pnfs_block_volume {
enum pnfs_block_volume_type type;
union {
@@ -82,6 +81,13 @@ struct pnfs_block_volume {
u32 volumes_count;
u32 volumes[PNFS_BLOCK_MAX_DEVICES];
} stripe;
+ struct {
+ enum scsi_code_set code_set;
+ enum scsi_designator_type designator_type;
+ int designator_len;
+ u8 designator[256];
+ u64 pr_key;
+ } scsi;
};
};
@@ -106,6 +112,9 @@ struct pnfs_block_dev {
struct block_device *bdev;
u64 disk_offset;
+ u64 pr_key;
+ bool pr_registered;
+
bool (*map)(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map);
};
@@ -131,6 +140,7 @@ struct pnfs_block_layout {
struct rb_root bl_ext_rw;
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
+ bool bl_scsi_layout;
};
static inline struct pnfs_block_layout *
@@ -182,6 +192,6 @@ void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
dev_t bl_resolve_deviceid(struct nfs_server *server,
struct pnfs_block_volume *b, gfp_t gfp_mask);
int __init bl_init_pipefs(void);
-void __exit bl_cleanup_pipefs(void);
+void bl_cleanup_pipefs(void);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index a861bbdfe577..e5b89675263e 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -1,11 +1,12 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/blkdev.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_xdr.h>
+#include <linux/pr.h>
#include "blocklayout.h"
@@ -21,6 +22,17 @@ bl_free_device(struct pnfs_block_dev *dev)
bl_free_device(&dev->children[i]);
kfree(dev->children);
} else {
+ if (dev->pr_registered) {
+ const struct pr_ops *ops =
+ dev->bdev->bd_disk->fops->pr_ops;
+ int error;
+
+ error = ops->pr_register(dev->bdev, dev->pr_key, 0,
+ false);
+ if (error)
+ pr_err("failed to unregister PR key.\n");
+ }
+
if (dev->bdev)
blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
}
@@ -113,6 +125,24 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
for (i = 0; i < b->stripe.volumes_count; i++)
b->stripe.volumes[i] = be32_to_cpup(p++);
break;
+ case PNFS_BLOCK_VOLUME_SCSI:
+ p = xdr_inline_decode(xdr, 4 + 4 + 4);
+ if (!p)
+ return -EIO;
+ b->scsi.code_set = be32_to_cpup(p++);
+ b->scsi.designator_type = be32_to_cpup(p++);
+ b->scsi.designator_len = be32_to_cpup(p++);
+ p = xdr_inline_decode(xdr, b->scsi.designator_len);
+ if (!p)
+ return -EIO;
+ if (b->scsi.designator_len > 256)
+ return -EIO;
+ memcpy(&b->scsi.designator, p, b->scsi.designator_len);
+ p = xdr_inline_decode(xdr, 8);
+ if (!p)
+ return -EIO;
+ p = xdr_decode_hyper(p, &b->scsi.pr_key);
+ break;
default:
dprintk("unknown volume type!\n");
return -EIO;
@@ -216,6 +246,116 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
}
+static bool
+bl_validate_designator(struct pnfs_block_volume *v)
+{
+ switch (v->scsi.designator_type) {
+ case PS_DESIGNATOR_EUI64:
+ if (v->scsi.code_set != PS_CODE_SET_BINARY)
+ return false;
+
+ if (v->scsi.designator_len != 8 &&
+ v->scsi.designator_len != 10 &&
+ v->scsi.designator_len != 16)
+ return false;
+
+ return true;
+ case PS_DESIGNATOR_NAA:
+ if (v->scsi.code_set != PS_CODE_SET_BINARY)
+ return false;
+
+ if (v->scsi.designator_len != 8 &&
+ v->scsi.designator_len != 16)
+ return false;
+
+ return true;
+ case PS_DESIGNATOR_T10:
+ case PS_DESIGNATOR_NAME:
+ pr_err("pNFS: unsupported designator "
+ "(code set %d, type %d, len %d.\n",
+ v->scsi.code_set,
+ v->scsi.designator_type,
+ v->scsi.designator_len);
+ return false;
+ default:
+ pr_err("pNFS: invalid designator "
+ "(code set %d, type %d, len %d.\n",
+ v->scsi.code_set,
+ v->scsi.designator_type,
+ v->scsi.designator_len);
+ return false;
+ }
+}
+
+static int
+bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
+ struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
+{
+ struct pnfs_block_volume *v = &volumes[idx];
+ const struct pr_ops *ops;
+ const char *devname;
+ int error;
+
+ if (!bl_validate_designator(v))
+ return -EINVAL;
+
+ switch (v->scsi.designator_len) {
+ case 8:
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
+ v->scsi.designator);
+ break;
+ case 12:
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
+ v->scsi.designator);
+ break;
+ case 16:
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
+ v->scsi.designator);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
+ if (IS_ERR(d->bdev)) {
+ pr_warn("pNFS: failed to open device %s (%ld)\n",
+ devname, PTR_ERR(d->bdev));
+ kfree(devname);
+ return PTR_ERR(d->bdev);
+ }
+
+ kfree(devname);
+
+ d->len = i_size_read(d->bdev->bd_inode);
+ d->map = bl_map_simple;
+ d->pr_key = v->scsi.pr_key;
+
+ pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
+ d->bdev->bd_disk->disk_name, d->pr_key);
+
+ ops = d->bdev->bd_disk->fops->pr_ops;
+ if (!ops) {
+ pr_err("pNFS: block device %s does not support reservations.",
+ d->bdev->bd_disk->disk_name);
+ error = -EINVAL;
+ goto out_blkdev_put;
+ }
+
+ error = ops->pr_register(d->bdev, 0, d->pr_key, true);
+ if (error) {
+ pr_err("pNFS: failed to register key for block device %s.",
+ d->bdev->bd_disk->disk_name);
+ goto out_blkdev_put;
+ }
+
+ d->pr_registered = true;
+ return 0;
+
+out_blkdev_put:
+ blkdev_put(d->bdev, FMODE_READ);
+ return error;
+}
+
static int
bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
@@ -303,6 +443,8 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
return bl_parse_concat(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_STRIPE:
return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
+ case PNFS_BLOCK_VOLUME_SCSI:
+ return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
default:
dprintk("unsupported volume type: %d\n", volumes[idx].type);
return -EIO;
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c59a59c37f3d..720b3ff55fa9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/vmalloc.h>
@@ -462,10 +462,12 @@ out:
return err;
}
-static size_t ext_tree_layoutupdate_size(size_t count)
+static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
- return sizeof(__be32) /* number of entries */ +
- PNFS_BLOCK_EXTENT_SIZE * count;
+ if (bl->bl_scsi_layout)
+ return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
+ else
+ return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
@@ -476,12 +478,30 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
for (i = 0; i < nr_pages; i++)
put_page(arg->layoutupdate_pages[i]);
+ vfree(arg->start_p);
kfree(arg->layoutupdate_pages);
} else {
put_page(arg->layoutupdate_page);
}
}
+static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
+{
+ p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
+ NFS4_DEVICEID4_SIZE);
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+ p = xdr_encode_hyper(p, 0LL);
+ *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
+ return p;
+}
+
+static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
+{
+ p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
+ return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
+}
+
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
size_t buffer_size, size_t *count)
{
@@ -495,19 +515,16 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
continue;
(*count)++;
- if (ext_tree_layoutupdate_size(*count) > buffer_size) {
+ if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
}
- p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
- NFS4_DEVICEID4_SIZE);
- p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
- p = xdr_encode_hyper(p, 0LL);
- *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
-
+ if (bl->bl_scsi_layout)
+ p = encode_scsi_range(be, p);
+ else
+ p = encode_block_extent(be, p);
be->be_tag = EXTENT_COMMITTING;
}
spin_unlock(&bl->bl_ext_lock);
@@ -536,7 +553,7 @@ retry:
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
- buffer_size = ext_tree_layoutupdate_size(count);
+ buffer_size = ext_tree_layoutupdate_size(bl, count);
count = 0;
arg->layoutupdate_pages =
@@ -555,14 +572,19 @@ retry:
}
*start_p = cpu_to_be32(count);
- arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
+ arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
void *p = start_p, *end = p + arg->layoutupdate_len;
+ struct page *page = NULL;
int i = 0;
- for ( ; p < end; p += PAGE_SIZE)
- arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+ arg->start_p = start_p;
+ for ( ; p < end; p += PAGE_SIZE) {
+ page = vmalloc_to_page(p);
+ arg->layoutupdate_pages[i++] = page;
+ get_page(page);
+ }
}
dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index dbe5839cdeba..9fb067a6f7e0 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -281,7 +281,7 @@ out:
return ret;
}
-void __exit bl_cleanup_pipefs(void)
+void bl_cleanup_pipefs(void)
{
rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
unregister_pernet_subsys(&nfs4blocklayout_net_ops);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f7c0a7538a..a7f2e6e33305 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,17 +99,6 @@ nfs4_callback_up(struct svc_serv *serv)
}
#if defined(CONFIG_NFS_V4_1)
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- /*
- * Create an svc_sock for the back channel service that shares the
- * fore channel connection.
- * Returns the input port (0) and sets the svc_serv bc_xprt on success
- */
- return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
- SVC_SOCK_ANONYMOUS);
-}
-
/*
* The callback service for NFSv4.1 callbacks
*/
@@ -184,11 +173,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
xprt->bc_serv = serv;
}
#else
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- return 0;
-}
-
static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
{
@@ -259,7 +243,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
svc_shutdown_net(serv, net);
}
-static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+ struct net *net, struct rpc_xprt *xprt)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
int ret;
@@ -275,20 +260,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
goto err_bind;
}
- switch (minorversion) {
- case 0:
- ret = nfs4_callback_up_net(serv, net);
- break;
- case 1:
- case 2:
- ret = nfs41_callback_up_net(serv, net);
- break;
- default:
- printk(KERN_ERR "NFS: unknown callback version: %d\n",
- minorversion);
- ret = -EINVAL;
- break;
- }
+ ret = -EPROTONOSUPPORT;
+ if (minorversion == 0)
+ ret = nfs4_callback_up_net(serv, net);
+ else if (xprt->ops->bc_up)
+ ret = xprt->ops->bc_up(serv, net);
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
@@ -364,7 +340,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
goto err_create;
}
- ret = nfs_callback_up_net(minorversion, serv, net);
+ ret = nfs_callback_up_net(minorversion, serv, net, xprt);
if (ret < 0)
goto err_net;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 84326e9fb47a..5fe1cecbf9f0 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -37,10 +37,11 @@ enum nfs4_callback_opnum {
OP_CB_ILLEGAL = 10044,
};
+struct nfs4_slot;
struct cb_process_state {
__be32 drc_status;
struct nfs_client *clp;
- u32 slotid;
+ struct nfs4_slot *slot;
u32 minorversion;
struct net *net;
};
@@ -61,7 +62,6 @@ struct cb_compound_hdr_res {
};
struct cb_getattrargs {
- struct sockaddr *addr;
struct nfs_fh fh;
uint32_t bitmap[2];
};
@@ -76,7 +76,6 @@ struct cb_getattrres {
};
struct cb_recallargs {
- struct sockaddr *addr;
struct nfs_fh fh;
nfs4_stateid stateid;
uint32_t truncate;
@@ -119,9 +118,6 @@ extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
struct cb_sequenceres *res,
struct cb_process_state *cps);
-extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-
#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
#define RCA4_TYPE_MASK_DIR_DLG 2
@@ -134,7 +130,6 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
#define RCA4_TYPE_MASK_ALL 0xf31f
struct cb_recallanyargs {
- struct sockaddr *craa_addr;
uint32_t craa_objs_to_keep;
uint32_t craa_type_mask;
};
@@ -144,7 +139,6 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
struct cb_process_state *cps);
struct cb_recallslotargs {
- struct sockaddr *crsa_addr;
uint32_t crsa_target_highest_slotid;
};
extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
@@ -152,7 +146,6 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
struct cb_process_state *cps);
struct cb_layoutrecallargs {
- struct sockaddr *cbl_addr;
uint32_t cbl_recall_type;
uint32_t cbl_layout_type;
uint32_t cbl_layoutchanged;
@@ -196,9 +189,6 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
#if IS_ENABLED(CONFIG_NFS_V4)
extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
extern void nfs_callback_down(int minorversion, struct net *net);
-extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#endif /* CONFIG_NFS_V4 */
/*
* nfs41: Callbacks are expected to not cause substantial latency,
@@ -209,6 +199,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#define NFS41_BC_MAX_CALLBACKS 1
extern unsigned int nfs_callback_set_tcpport;
-extern unsigned short nfs_callback_tcpport;
#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b85cf7a30232..618ced381a14 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -17,9 +17,7 @@
#include "nfs4session.h"
#include "nfs4trace.h"
-#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
-#endif
__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
struct cb_getattrres *res,
@@ -85,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
res = htonl(NFS4ERR_BADHANDLE);
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
- if (inode == NULL)
+ if (inode == NULL) {
+ trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+ &args->stateid, -ntohl(res));
goto out;
+ }
/* Set up a helper thread to actually return the delegation */
switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
case 0:
@@ -98,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
default:
res = htonl(NFS4ERR_RESOURCE);
}
- trace_nfs4_recall_delegation(inode, -ntohl(res));
+ trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+ &args->stateid, -ntohl(res));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -162,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
return lo;
}
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ */
+static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new)
+{
+ u32 oldseq, newseq;
+
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ newseq = be32_to_cpu(new->seqid);
+
+ if (newseq > oldseq + 1)
+ return false;
+ return true;
+}
+
static u32 initiate_file_draining(struct nfs_client *clp,
struct cb_layoutrecallargs *args)
{
@@ -171,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp,
LIST_HEAD(free_me_list);
lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
- if (!lo)
+ if (!lo) {
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
+ &args->cbl_stateid, -rv);
goto out;
+ }
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
+ if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
+ rv = NFS4ERR_DELAY;
+ goto unlock;
+ }
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
spin_unlock(&ino->i_lock);
pnfs_layoutcommit_inode(ino, false);
spin_lock(&ino->i_lock);
- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
- pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
- &args->cbl_range)) {
+ /*
+ * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+ */
+ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
rv = NFS4ERR_DELAY;
goto unlock;
}
+ if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+ &args->cbl_range)) {
+ rv = NFS4_OK;
+ goto unlock;
+ }
+
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range);
}
+ pnfs_mark_layout_returned_if_empty(lo);
unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(ino, 0);
pnfs_put_layout_hdr(lo);
- trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
+ trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+ &args->cbl_stateid, -rv);
iput(ino);
out:
return rv;
@@ -318,47 +354,38 @@ out:
* a single outstanding callback request at a time.
*/
static __be32
-validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
+validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot,
+ const struct cb_sequenceargs * args)
{
- struct nfs4_slot *slot;
-
- dprintk("%s enter. slotid %u seqid %u\n",
- __func__, args->csa_slotid, args->csa_sequenceid);
+ dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n",
+ __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr);
- if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
+ if (args->csa_slotid > tbl->server_highest_slotid)
return htonl(NFS4ERR_BADSLOT);
- slot = tbl->slots + args->csa_slotid;
- dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
-
- /* Normal */
- if (likely(args->csa_sequenceid == slot->seq_nr + 1))
- goto out_ok;
-
/* Replay */
if (args->csa_sequenceid == slot->seq_nr) {
dprintk("%s seqid %u is a replay\n",
__func__, args->csa_sequenceid);
+ if (nfs4_test_locked_slot(tbl, slot->slot_nr))
+ return htonl(NFS4ERR_DELAY);
/* Signal process_op to set this error on next op */
if (args->csa_cachethis == 0)
return htonl(NFS4ERR_RETRY_UNCACHED_REP);
- /* The ca_maxresponsesize_cached is 0 with no DRC */
- else if (args->csa_cachethis == 1)
- return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
+ /* Liar! We never allowed you to set csa_cachethis != 0 */
+ return htonl(NFS4ERR_SEQ_FALSE_RETRY);
}
/* Wraparound */
- if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
- slot->seq_nr = 1;
- goto out_ok;
- }
+ if (unlikely(slot->seq_nr == 0xFFFFFFFFU)) {
+ if (args->csa_sequenceid == 1)
+ return htonl(NFS4_OK);
+ } else if (likely(args->csa_sequenceid == slot->seq_nr + 1))
+ return htonl(NFS4_OK);
/* Misordered request */
return htonl(NFS4ERR_SEQ_MISORDERED);
-out_ok:
- tbl->highest_used_slotid = args->csa_slotid;
- return htonl(NFS4_OK);
}
/*
@@ -437,6 +464,12 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
tbl = &clp->cl_session->bc_slot_table;
slot = tbl->slots + args->csa_slotid;
+ /* Set up res before grabbing the spinlock */
+ memcpy(&res->csr_sessionid, &args->csa_sessionid,
+ sizeof(res->csr_sessionid));
+ res->csr_sequenceid = args->csa_sequenceid;
+ res->csr_slotid = args->csa_slotid;
+
spin_lock(&tbl->slot_tbl_lock);
/* state manager is resetting the session */
if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
@@ -449,18 +482,26 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
goto out_unlock;
}
- memcpy(&res->csr_sessionid, &args->csa_sessionid,
- sizeof(res->csr_sessionid));
- res->csr_sequenceid = args->csa_sequenceid;
- res->csr_slotid = args->csa_slotid;
- res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
- res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
+ status = htonl(NFS4ERR_BADSLOT);
+ slot = nfs4_lookup_slot(tbl, args->csa_slotid);
+ if (IS_ERR(slot))
+ goto out_unlock;
+
+ res->csr_highestslotid = tbl->server_highest_slotid;
+ res->csr_target_highestslotid = tbl->target_highest_slotid;
- status = validate_seqid(tbl, args);
+ status = validate_seqid(tbl, slot, args);
if (status)
goto out_unlock;
+ if (!nfs4_try_to_lock_slot(tbl, slot)) {
+ status = htonl(NFS4ERR_DELAY);
+ goto out_unlock;
+ }
+ cps->slot = slot;
- cps->slotid = args->csa_slotid;
+ /* The ca_maxresponsesize_cached is 0 with no DRC */
+ if (args->csa_cachethis != 0)
+ return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE);
/*
* Check for pending referring calls. If a match is found, a
@@ -477,7 +518,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
* If CB_SEQUENCE returns an error, then the state of the slot
* (sequence ID, cached reply) MUST NOT change.
*/
- slot->seq_nr++;
+ slot->seq_nr = args->csa_sequenceid;
out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 6b1697a01dde..976c90608e56 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -18,19 +18,21 @@
#include "internal.h"
#include "nfs4session.h"
-#define CB_OP_TAGLEN_MAXSZ (512)
-#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
-#define CB_OP_GETATTR_BITMAP_MAXSZ (4)
-#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- CB_OP_GETATTR_BITMAP_MAXSZ + \
- 2 + 2 + 3 + 3)
-#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_TAGLEN_MAXSZ (512)
+#define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status
+#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps
+#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ CB_OP_GETATTR_BITMAP_MAXSZ + \
+ /* change, size, ctime, mtime */\
+ (2 + 2 + 3 + 3) * 4)
+#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- 4 + 1 + 3)
+ NFS4_MAX_SESSIONID_LEN + \
+ (1 + 3) * 4) // seqid, 3 slotids
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#endif /* CONFIG_NFS_V4_1 */
@@ -157,7 +159,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (unlikely(status != 0))
return status;
/* We do not like overly long tags! */
- if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
+ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) {
printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
__func__, hdr->taglen);
return htonl(NFS4ERR_RESOURCE);
@@ -198,7 +200,6 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
status = decode_fh(xdr, &args->fh);
if (unlikely(status != 0))
goto out;
- args->addr = svc_addr(rqstp);
status = decode_bitmap(xdr, args->bitmap);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
@@ -210,7 +211,6 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
__be32 *p;
__be32 status;
- args->addr = svc_addr(rqstp);
status = decode_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
goto out;
@@ -236,7 +236,6 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
__be32 status = 0;
uint32_t iomode;
- args->cbl_addr = svc_addr(rqstp);
p = read_buf(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
@@ -383,13 +382,12 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = read_buf(xdr, len);
+ p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(sid->data, p, len);
+ memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -500,7 +498,6 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
uint32_t bitmap[2];
__be32 *p, status;
- args->craa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -519,7 +516,6 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
{
__be32 *p;
- args->crsa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -684,13 +680,12 @@ static __be32 encode_sessionid(struct xdr_stream *xdr,
const struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = xdr_reserve_space(xdr, len);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(p, sid, len);
+ memcpy(p, sid, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -704,7 +699,9 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
if (unlikely(status != 0))
goto out;
- encode_sessionid(xdr, &res->csr_sessionid);
+ status = encode_sessionid(xdr, &res->csr_sessionid);
+ if (status)
+ goto out;
p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
@@ -755,7 +752,8 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
return htonl(NFS_OK);
}
-static void nfs4_callback_free_slot(struct nfs4_session *session)
+static void nfs4_callback_free_slot(struct nfs4_session *session,
+ struct nfs4_slot *slot)
{
struct nfs4_slot_table *tbl = &session->bc_slot_table;
@@ -764,15 +762,17 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
* Let the state manager know callback processing done.
* A single slot, so highest used slotid is either 0 or -1
*/
- tbl->highest_used_slotid = NFS4_NO_SLOT;
+ nfs4_free_slot(tbl, slot);
nfs4_slot_tbl_drain_complete(tbl);
spin_unlock(&tbl->slot_tbl_lock);
}
static void nfs4_cb_free_slot(struct cb_process_state *cps)
{
- if (cps->slotid != NFS4_NO_SLOT)
- nfs4_callback_free_slot(cps->clp->cl_session);
+ if (cps->slot) {
+ nfs4_callback_free_slot(cps->clp->cl_session, cps->slot);
+ cps->slot = NULL;
+ }
}
#else /* CONFIG_NFS_V4_1 */
@@ -896,7 +896,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
struct cb_process_state cps = {
.drc_status = 0,
.clp = NULL,
- .slotid = NFS4_NO_SLOT,
.net = SVC_NET(rqstp),
};
unsigned int nops = 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 57c5a02f6213..0c96528db94a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -736,7 +736,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->rsize = max_rpc_payload;
if (server->rsize > NFS_MAX_FILE_IO_SIZE)
server->rsize = NFS_MAX_FILE_IO_SIZE;
- server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
server->backing_dev_info.name = "nfs";
server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
@@ -745,13 +745,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
- server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
- if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
- server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
+ if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES)
+ server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES;
if (server->dtsize > server->rsize)
server->dtsize = server->rsize;
@@ -764,6 +764,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->time_delta = fsinfo->time_delta;
+ server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne Set socket buffersize */
rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index be806ead7f4d..5166adcfc0fb 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -721,14 +721,12 @@ int nfs_async_inode_return_delegation(struct inode *inode,
struct nfs_client *clp = server->nfs_client;
struct nfs_delegation *delegation;
- filemap_flush(inode->i_mapping);
-
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation == NULL)
goto out_enoent;
-
- if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+ if (stateid != NULL &&
+ !clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
goto out_enoent;
nfs_mark_return_delegation(server, delegation);
rcu_read_unlock();
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3d8e4ffa0a33..33eb81738d03 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -377,7 +377,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
again:
timestamp = jiffies;
gencount = nfs_inc_attr_generation_counter();
- error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, entry->cookie, pages,
+ error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
/* We requested READDIRPLUS, but the server doesn't grok it */
@@ -560,7 +560,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
count++;
if (desc->plus != 0)
- nfs_prime_dcache(desc->file->f_path.dentry, entry);
+ nfs_prime_dcache(file_dentry(desc->file), entry);
status = nfs_readdir_add_to_array(entry, page);
if (status != 0)
@@ -707,7 +707,7 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
{
if (!desc->page->mapping)
nfs_readdir_clear_array(desc->page);
- page_cache_release(desc->page);
+ put_page(desc->page);
desc->page = NULL;
}
@@ -864,7 +864,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
*/
static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
- struct dentry *dentry = file->f_path.dentry;
+ struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
nfs_readdir_descriptor_t my_desc,
*desc = &my_desc;
@@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
filp, offset, whence);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case 1:
offset += filp->f_pos;
@@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
dir_ctx->duped = 0;
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
@@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
@@ -1360,19 +1360,15 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry);
nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
- res = ERR_PTR(-ENAMETOOLONG);
- if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
- goto out;
+ if (unlikely(dentry->d_name.len > NFS_SERVER(dir)->namelen))
+ return ERR_PTR(-ENAMETOOLONG);
/*
* If we're doing an exclusive create, optimize away the lookup
* but don't hash the dentry.
*/
- if (nfs_is_exclusive_create(dir, flags)) {
- d_instantiate(dentry, NULL);
- res = NULL;
- goto out;
- }
+ if (nfs_is_exclusive_create(dir, flags))
+ return NULL;
res = ERR_PTR(-ENOMEM);
fhandle = nfs_alloc_fhandle();
@@ -1714,9 +1710,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
@@ -1897,15 +1890,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- page = alloc_page(GFP_HIGHUSER);
+ page = alloc_page(GFP_USER);
if (!page)
return -ENOMEM;
- kaddr = kmap_atomic(page);
+ kaddr = page_address(page);
memcpy(kaddr, symname, pathlen);
if (pathlen < PAGE_SIZE)
memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
- kunmap_atomic(kaddr);
trace_nfs_symlink_enter(dir, dentry);
error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
@@ -1931,7 +1923,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
* add_to_page_cache_lru() grabs an extra page refcount.
* Drop it here to avoid leaking this page later.
*/
- page_cache_release(page);
+ put_page(page);
} else
__free_page(page);
@@ -2435,6 +2427,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
}
EXPORT_SYMBOL_GPL(nfs_may_open);
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ int ret;
+
+ if (mask & MAY_NOT_BLOCK)
+ ret = nfs_revalidate_inode_rcu(server, inode);
+ else
+ ret = nfs_revalidate_inode(server, inode);
+ if (ret == 0 && !execute_ok(inode))
+ ret = -EACCES;
+ return ret;
+}
+
int nfs_permission(struct inode *inode, int mask)
{
struct rpc_cred *cred;
@@ -2452,6 +2458,9 @@ int nfs_permission(struct inode *inode, int mask)
case S_IFLNK:
goto out;
case S_IFREG:
+ if ((mask & MAY_OPEN) &&
+ nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+ return 0;
break;
case S_IFDIR:
/*
@@ -2484,8 +2493,8 @@ force_lookup:
res = PTR_ERR(cred);
}
out:
- if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
- res = -EACCES;
+ if (!res && (mask & MAY_EXEC))
+ res = nfs_execute_ok(inode, mask);
dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4b1d08f56aba..c93826e4a8c6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}
-void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
-{
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-}
-EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
-
static void
nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
{
@@ -275,7 +269,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++)
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
@@ -586,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
if (!count)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
result = nfs_sync_mapping(mapping);
if (result)
goto out_unlock;
@@ -614,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
NFS_I(inode)->read_io += count;
result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -628,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
out_release:
nfs_direct_req_release(dreq);
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
return result;
}
@@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
req = nfs_list_entry(reqs.next);
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ list_splice_init(&reqs, &failed);
+ goto out_failed;
+ }
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
@@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_list_add_request(req, &failed);
spin_lock(cinfo.lock);
dreq->flags = 0;
- dreq->error = -EIO;
+ if (desc.pg_error < 0)
+ dreq->error = desc.pg_error;
+ else
+ dreq->error = -EIO;
spin_unlock(cinfo.lock);
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
+out_failed:
while (!list_empty(&failed)) {
req = nfs_list_entry(failed.next);
nfs_list_remove_request(req);
@@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
nfs_direct_write_complete(dreq, data->inode);
}
-static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
- /* There is no lock to clear */
+ struct nfs_direct_req *dreq = cinfo->dreq;
+
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_mark_request_commit(req, NULL, cinfo, 0);
}
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
.completion = nfs_direct_commit_complete,
- .error_cleanup = nfs_direct_error_cleanup,
+ .resched_write = nfs_direct_resched_write,
};
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
@@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
}
}
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+ struct nfs_direct_req *dreq = hdr->dreq;
+
+ spin_lock(&dreq->lock);
+ if (dreq->error == 0) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ /* fake unstable write to let common nfs resend pages */
+ hdr->verf.committed = NFS_UNSTABLE;
+ hdr->good_bytes = hdr->args.count;
+ }
+ spin_unlock(&dreq->lock);
+}
+
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
.error_cleanup = nfs_write_sync_pgio_error,
.init_hdr = nfs_direct_pgio_init,
.completion = nfs_direct_write_completion,
+ .reschedule_io = nfs_direct_write_reschedule_io,
};
@@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
}
nfs_direct_setup_mirroring(dreq, &desc, req);
+ if (desc.pg_error < 0) {
+ nfs_free_request(req);
+ result = desc.pg_error;
+ break;
+ }
nfs_lock_request(req);
req->wb_index = pos >> PAGE_SHIFT;
@@ -975,9 +1003,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
iov_iter_count(iter));
pos = iocb->ki_pos;
- end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
+ end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
result = nfs_sync_mapping(mapping);
if (result)
@@ -985,7 +1013,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (mapping->nrpages) {
result = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
+ pos >> PAGE_SHIFT, end);
if (result)
goto out_unlock;
}
@@ -1014,10 +1042,10 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
+ pos >> PAGE_SHIFT, end);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -1038,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
out_release:
nfs_direct_req_release(dreq);
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return result;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 37f639d50af5..be01095b97ae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(nfs_file_mmap);
* nfs_file_write() that a write error occurred, and hence cause it to
* fall back to doing a synchronous write.
*/
-int
+static int
nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -263,9 +263,8 @@ nfs_file_fsync_commit(struct file *file, loff_t start, loff_t end, int datasync)
out:
return ret;
}
-EXPORT_SYMBOL_GPL(nfs_file_fsync_commit);
-static int
+int
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int ret;
@@ -273,14 +272,16 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_enter(inode);
- nfs_inode_dio_wait(inode);
+ inode_dio_wait(inode);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
break;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync);
- mutex_unlock(&inode->i_mutex);
+ if (!ret)
+ ret = pnfs_sync_inode(inode, !!datasync);
+ inode_unlock(inode);
/*
* If nfs_file_fsync_commit detected a server reboot, then
* resend all dirty pages that might have been covered by
@@ -293,6 +294,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_exit(inode, ret);
return ret;
}
+EXPORT_SYMBOL_GPL(nfs_file_fsync);
/*
* Decide whether a read/modify/write cycle may be more efficient
@@ -318,7 +320,7 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
loff_t pos, unsigned len)
{
unsigned int pglen = nfs_page_length(page);
- unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned int offset = pos & (PAGE_SIZE - 1);
unsigned int end = offset + len;
if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
@@ -349,7 +351,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
int ret;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int once_thru = 0;
@@ -368,7 +370,7 @@ start:
/*
* Wait for O_DIRECT to complete
*/
- nfs_inode_dio_wait(mapping->host);
+ inode_dio_wait(mapping->host);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -378,12 +380,12 @@ start:
ret = nfs_flush_incompatible(file, page);
if (ret) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
} else if (!once_thru &&
nfs_want_read_modify_write(file, page, pos, len)) {
once_thru = 1;
ret = nfs_readpage(file, page);
- page_cache_release(page);
+ put_page(page);
if (!ret)
goto start;
}
@@ -394,7 +396,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned offset = pos & (PAGE_SIZE - 1);
struct nfs_open_context *ctx = nfs_file_open_context(file);
int status;
@@ -411,20 +413,20 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
if (pglen == 0) {
zero_user_segments(page, 0, offset,
- end, PAGE_CACHE_SIZE);
+ end, PAGE_SIZE);
SetPageUptodate(page);
} else if (end >= pglen) {
- zero_user_segment(page, end, PAGE_CACHE_SIZE);
+ zero_user_segment(page, end, PAGE_SIZE);
if (offset == 0)
SetPageUptodate(page);
} else
- zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ zero_user_segment(page, pglen, PAGE_SIZE);
}
status = nfs_updatepage(file, page, offset, copied);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (status < 0)
return status;
@@ -452,7 +454,7 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
page, offset, length);
- if (offset != 0 || length < PAGE_CACHE_SIZE)
+ if (offset != 0 || length < PAGE_SIZE)
return;
/* Cancel any unstarted writes on this page */
nfs_wb_page_cancel(page_file_mapping(page)->host, page);
@@ -473,8 +475,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* Always try to initiate a 'commit' if relevant, but only
- * wait for it if __GFP_WAIT is set. Even then, only wait 1
- * second and only if the 'bdi' is not congested.
+ * wait for it if the caller allows blocking. Even then,
+ * only wait 1 second and only if the 'bdi' is not congested.
* Waiting indefinitely can cause deadlocks when the NFS
* server is on this machine, when a new TCP connection is
* needed and in other rare cases. There is no particular
@@ -484,7 +486,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
if (mapping) {
struct nfs_server *nfss = NFS_SERVER(mapping->host);
nfs_commit_inode(mapping->host, 0);
- if ((gfp & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(gfp) &&
!bdi_write_congested(&nfss->backing_dev_info)) {
wait_on_page_bit_killable_timeout(page, PG_private,
HZ);
@@ -514,7 +516,7 @@ static void nfs_check_dirty_writeback(struct page *page,
* so it will not block due to pages that will shortly be freeable.
*/
nfsi = NFS_I(mapping->host);
- if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+ if (atomic_read(&nfsi->commit_info.rpcs_out)) {
*writeback = true;
return;
}
@@ -545,7 +547,7 @@ static int nfs_launder_page(struct page *page)
inode->i_ino, (long long)page_offset(page));
nfs_fscache_wait_on_page_write(nfsi, page);
- return nfs_wb_page(inode, page);
+ return nfs_wb_launder_page(inode, page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -756,7 +758,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
if (!IS_ERR(l_ctx)) {
- status = nfs_iocounter_wait(&l_ctx->io_count);
+ status = nfs_iocounter_wait(l_ctx);
nfs_put_lock_context(l_ctx);
if (status < 0)
return status;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 02ec07973bc4..3384dc8e6683 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
pnfs_error_mark_layout_for_return(inode, lseg);
+ pnfs_set_lo_fail(lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
@@ -883,13 +884,19 @@ static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_read_mds(pgio);
@@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
+
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -957,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req,
u32 i, j;
if (fl->commit_through_mds) {
- nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+ nfs_request_add_commit_list(req, cinfo);
} else {
/* Note that we are calling nfs4_fl_calc_j_index on each page
* that ends up being committed to a data server. An attractive
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index fbc5a56de875..0cb1abd535e3 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -145,7 +145,7 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
return false;
for (i = 0; i < m1->fh_versions_cnt; i++) {
bool found_fh = false;
- for (j = 0; j < m2->fh_versions_cnt; i++) {
+ for (j = 0; j < m2->fh_versions_cnt; j++) {
if (nfs_compare_fh(&m1->fh_versions[i],
&m2->fh_versions[j]) == 0) {
found_fh = true;
@@ -339,6 +339,19 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
}
}
+static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
+{
+ struct nfs4_deviceid_node *node;
+ int i;
+
+ if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
+ return;
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ node = &fls->mirror_array[i]->mirror_ds->id_node;
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ }
+}
+
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_layoutget_res *lgr,
@@ -492,13 +505,22 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
}
p = xdr_inline_decode(&stream, 4);
- if (p)
- fls->flags = be32_to_cpup(p);
+ if (!p)
+ goto out_sort_mirrors;
+ fls->flags = be32_to_cpup(p);
+
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_sort_mirrors;
+ for (i=0; i < fls->mirror_array_cnt; i++)
+ fls->mirror_array[i]->report_interval = be32_to_cpup(p);
+out_sort_mirrors:
ff_layout_sort_mirrors(fls);
rc = ff_layout_check_layout(lgr);
if (rc)
goto out_err_free;
+ ff_layout_mark_devices_valid(fls);
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
@@ -589,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
- if (layoutstats_timer != 0)
+ if (mirror->report_interval != 0)
+ report_interval = (s64)mirror->report_interval * 1000LL;
+ else if (layoutstats_timer != 0)
report_interval = (s64)layoutstats_timer * 1000LL;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
report_interval) {
@@ -741,17 +765,17 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
}
static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx,
int *best_idx)
{
- struct nfs4_ff_layout_segment *fls;
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_pnfs_ds *ds;
int idx;
- fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
/* mirrors are sorted by efficiency */
- for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
if (ds) {
*best_idx = idx;
return ds;
@@ -771,18 +795,24 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
/* Use full layout for now */
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
- ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+ ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
if (!ds)
goto out_mds;
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -811,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int i;
int status;
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
goto out_mds;
@@ -853,18 +889,25 @@ static unsigned int
ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- if (!pgio->pg_lseg)
+ if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ goto out;
+ }
+ }
if (pgio->pg_lseg)
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
nfs_pageio_reset_write_mds(pgio);
+out:
return 1;
}
@@ -898,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
hdr->args.count,
(unsigned long long)hdr->args.offset);
- if (!hdr->dreq) {
- struct nfs_open_context *ctx;
-
- ctx = nfs_list_entry(hdr->pages.next)->wb_context;
- set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
- hdr->completion_ops->error_cleanup(&hdr->pages);
- } else {
- nfs_direct_set_resched_writes(hdr->dreq);
- /* fake unstable write to let common nfs resend pages */
- hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = hdr->args.count;
- }
+ hdr->completion_ops->reschedule_io(hdr);
return;
}
@@ -1035,7 +1067,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
- if (ff_layout_has_available_ds(lseg))
+ if (ff_layout_no_fallback_to_mds(lseg) ||
+ ff_layout_has_available_ds(lseg))
return -NFS4ERR_RESET_TO_PNFS;
reset:
dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1086,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
return -NFS4ERR_RESET_TO_PNFS;
out_retry:
task->tk_status = 0;
- rpc_restart_call(task);
+ rpc_restart_call_prepare(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
return -EAGAIN;
}
@@ -1144,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
}
+ switch (status) {
+ case NFS4ERR_DELAY:
+ case NFS4ERR_GRACE:
+ return;
+ default:
+ break;
+ }
+
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
@@ -1153,7 +1194,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
/* NFS_PROTO call done callback routines */
-
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1171,7 +1211,11 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ if (ff_layout_choose_best_ds_for_read(hdr->lseg,
+ hdr->pgio_mirror_idx + 1,
+ &hdr->pgio_mirror_idx))
+ goto out_eagain;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED,
&hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
@@ -1179,11 +1223,13 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
ff_layout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
- return -EAGAIN;
+ goto out_eagain;
}
return 0;
+out_eagain:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
}
static bool
@@ -1222,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
return ff_layout_test_devid_unavailable(node);
}
-static int ff_layout_read_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_read(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_read(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count,
+ hdr->res.count);
+}
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1245,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
}
hdr->pgio_done_cb = ff_layout_read_done_cb;
+ ff_layout_read_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1303,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
- nfs4_ff_layout_stat_io_end_read(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1321,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_read_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
}
+static void ff_layout_read_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1342,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, true);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
ff_layout_reset_write(hdr, false);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
return -EAGAIN;
}
@@ -1382,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
- pnfs_set_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -NFS4ERR_RESET_TO_MDS:
- pnfs_clear_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
@@ -1401,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return 0;
}
-static int ff_layout_write_prepare_common(struct rpc_task *task,
- struct nfs_pgio_header *hdr)
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count,
task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+ return;
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+ hdr->args.count, hdr->res.count,
+ hdr->res.verf->committed);
+}
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@@ -1425,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
return -EAGAIN;
}
+ ff_layout_write_record_layoutstats_start(task, hdr);
return 0;
}
@@ -1460,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count, hdr->res.count,
- hdr->res.verf->committed);
-
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1479,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
+ ff_layout_write_record_layoutstats_done(task, hdr);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
+static void ff_layout_write_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+ pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
+ if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
nfs4_ff_layout_stat_io_start_write(cdata->inode,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
0, task->tk_start);
}
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ struct nfs_page *req;
+ __u64 count = 0;
+
+ if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+ return;
+
+ if (task->tk_status == 0) {
+ list_for_each_entry(req, &cdata->pages, wb_list)
+ count += req->wb_bytes;
+ }
+ nfs4_ff_layout_stat_io_end_write(task,
+ FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+ count, count, NFS_FILE_SYNC);
+}
+
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+ struct nfs_commit_data *cdata)
+{
+ ff_layout_commit_record_layoutstats_start(task, cdata);
+}
+
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
ff_layout_commit_prepare_common(task, data);
@@ -1511,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
- struct nfs_commit_data *cdata = data;
- struct nfs_page *req;
- __u64 count = 0;
-
- if (task->tk_status == 0) {
- list_for_each_entry(req, &cdata->pages, wb_list)
- count += req->wb_bytes;
- }
-
- nfs4_ff_layout_stat_io_end_write(task,
- FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- count, count, NFS_FILE_SYNC);
-
pnfs_generic_write_commit_done(task, data);
}
@@ -1531,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
+ ff_layout_commit_record_layoutstats_done(task, cdata);
rpc_count_iostats_metrics(task,
&NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
}
+static void ff_layout_commit_release(void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+ pnfs_generic_commit_release(data);
+}
+
static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
.rpc_call_prepare = ff_layout_read_prepare_v3,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
.rpc_call_prepare = ff_layout_read_prepare_v4,
.rpc_call_done = ff_layout_read_call_done,
.rpc_count_stats = ff_layout_read_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_read_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
.rpc_call_prepare = ff_layout_write_prepare_v3,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
.rpc_call_prepare = ff_layout_write_prepare_v4,
.rpc_call_done = ff_layout_write_call_done,
.rpc_count_stats = ff_layout_write_count_stats,
- .rpc_release = pnfs_generic_rw_release,
+ .rpc_release = ff_layout_write_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
.rpc_call_prepare = ff_layout_commit_prepare_v3,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
.rpc_call_prepare = ff_layout_commit_prepare_v4,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
- .rpc_release = pnfs_generic_commit_release,
+ .rpc_release = ff_layout_commit_release,
};
static enum pnfs_try_status
@@ -1839,11 +1948,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
start = xdr_reserve_space(xdr, 4);
BUG_ON(!start);
- if (ff_layout_encode_ioerr(flo, xdr, args))
- goto out;
-
+ ff_layout_encode_ioerr(flo, xdr, args);
ff_layout_encode_iostats(flo, xdr, args);
-out:
+
*start = cpu_to_be32((xdr->p - start - 1) * 4);
dprintk("%s: Return\n", __func__);
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 68cc0d9828f9..dd353bb7dc0a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,6 +10,7 @@
#define FS_NFS_NFS4FLEXFILELAYOUT_H
#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
#include "../pnfs.h"
@@ -84,6 +85,7 @@ struct nfs4_ff_layout_mirror {
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
ktime_t last_report_time;
+ u32 report_interval;
};
struct nfs4_ff_layout_segment {
@@ -146,6 +148,12 @@ FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
}
static inline bool
+ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
+}
+
+static inline bool
ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
{
return nfs4_test_deviceid_unavailable(node);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e125e55de86d..add0e5a70bd6 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
err->length = end - err->offset;
}
-static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- nfs4_stateid *stateid,
- struct nfs4_deviceid *deviceid)
+static int
+ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
+ const struct nfs4_ff_layout_ds_err *e2)
{
- return err->status == status && err->opnum == opnum &&
- nfs4_stateid_match(&err->stateid, stateid) &&
- !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
- end_offset(err->offset, err->length) >= offset &&
- err->offset <= end_offset(offset, length);
-}
-
-static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
- struct nfs4_ff_layout_ds_err *new)
-{
- if (!ds_error_can_merge(old, new->offset, new->length, new->status,
- new->opnum, &new->stateid, &new->deviceid))
- return false;
-
- extend_ds_error(old, new->offset, new->length);
- return true;
+ int ret;
+
+ if (e1->opnum != e2->opnum)
+ return e1->opnum < e2->opnum ? -1 : 1;
+ if (e1->status != e2->status)
+ return e1->status < e2->status ? -1 : 1;
+ ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+ if (ret != 0)
+ return ret;
+ ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
+ if (ret != 0)
+ return ret;
+ if (end_offset(e1->offset, e1->length) < e2->offset)
+ return -1;
+ if (e1->offset > end_offset(e2->offset, e2->length))
+ return 1;
+ /* If ranges overlap or are contiguous, they are the same */
+ return 0;
}
-static bool
+static void
ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
struct nfs4_ff_layout_ds_err *dserr)
{
- struct nfs4_ff_layout_ds_err *err;
-
- list_for_each_entry(err, &flo->error_list, list) {
- if (merge_ds_error(err, dserr)) {
- return true;
- }
- }
-
- list_add(&dserr->list, &flo->error_list);
- return false;
-}
-
-static bool
-ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
- u64 length, int status, enum nfs_opnum4 opnum,
- nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
-{
- bool found = false;
- struct nfs4_ff_layout_ds_err *err;
-
- list_for_each_entry(err, &flo->error_list, list) {
- if (ds_error_can_merge(err, offset, length, status, opnum,
- stateid, deviceid)) {
- found = true;
- extend_ds_error(err, offset, length);
+ struct nfs4_ff_layout_ds_err *err, *tmp;
+ struct list_head *head = &flo->error_list;
+ int match;
+
+ /* Do insertion sort w/ merges */
+ list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
+ match = ff_ds_error_match(err, dserr);
+ if (match < 0)
+ continue;
+ if (match > 0) {
+ /* Add entry "dserr" _before_ entry "err" */
+ head = &err->list;
break;
}
+ /* Entries match, so merge "err" into "dserr" */
+ extend_ds_error(dserr, err->offset, err->length);
+ list_del(&err->list);
+ kfree(err);
}
- return found;
+ list_add_tail(&dserr->list, head);
}
int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
@@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
gfp_t gfp_flags)
{
struct nfs4_ff_layout_ds_err *dserr;
- bool needfree;
if (status == 0)
return 0;
@@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
if (mirror->mirror_ds == NULL)
return -EINVAL;
- spin_lock(&flo->generic_hdr.plh_inode->i_lock);
- if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
- &mirror->stateid,
- &mirror->mirror_ds->id_node.deviceid)) {
- spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
- return 0;
- }
- spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
dserr = kmalloc(sizeof(*dserr), gfp_flags);
if (!dserr)
return -ENOMEM;
@@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
NFS4_DEVICEID4_SIZE);
spin_lock(&flo->generic_hdr.plh_inode->i_lock);
- needfree = ff_layout_add_ds_error_locked(flo, dserr);
+ ff_layout_add_ds_error_locked(flo, dserr);
spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
- if (needfree)
- kfree(dserr);
return 0;
}
@@ -429,22 +410,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror, lseg->pls_range.offset,
lseg->pls_range.length, NFS4ERR_NXIO,
OP_ILLEGAL, GFP_NOIO);
- if (fail_return) {
- pnfs_error_mark_layout_for_return(ino, lseg);
- if (ff_layout_has_available_ds(lseg))
- pnfs_set_retry_layoutget(lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(lseg->pls_layout);
-
- } else {
+ if (!fail_return) {
if (ff_layout_has_available_ds(lseg))
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED,
&lseg->pls_layout->plh_flags);
- else {
+ else
pnfs_error_mark_layout_for_return(ino, lseg);
- pnfs_clear_retry_layoutget(lseg->pls_layout);
- }
- }
+ } else
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ ds = NULL;
+ goto out;
}
out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 326d9e10d833..738c84a42eb0 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
return nfs_fileid_to_ino_t(fattr->fileid);
}
-/**
- * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
- * @word: long word containing the bit lock
- */
-int nfs_wait_bit_killable(struct wait_bit_key *key)
+static int nfs_wait_killable(int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
+
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+ return nfs_wait_killable(mode);
+}
EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
+int nfs_wait_atomic_killable(atomic_t *p)
+{
+ return nfs_wait_killable(TASK_KILLABLE);
+}
+
/**
* nfs_compat_user_ino64 - returns the user-visible inode number
* @fileid: 64-bit fileid
@@ -135,7 +141,7 @@ void nfs_evict_inode(struct inode *inode)
int nfs_sync_inode(struct inode *inode)
{
- nfs_inode_dio_wait(inode);
+ inode_dio_wait(inode);
return nfs_wb_all(inode);
}
EXPORT_SYMBOL_GPL(nfs_sync_inode);
@@ -408,9 +414,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_fop = NULL;
inode->i_flags |= S_AUTOMOUNT;
}
- } else if (S_ISLNK(inode->i_mode))
+ } else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nfs_symlink_inode_operations;
- else
+ inode_nohighmem(inode);
+ } else
init_special_inode(inode, inode->i_mode, fattr->rdev);
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
@@ -618,7 +625,10 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
- nfs_update_inode(inode, fattr);
+ if (fattr->valid)
+ nfs_update_inode(inode, fattr);
+ else
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR;
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
@@ -651,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = nfs_sync_inode(inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (err)
goto out;
}
@@ -696,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
l_ctx->lockowner.l_owner = current->files;
l_ctx->lockowner.l_pid = current->tgid;
INIT_LIST_HEAD(&l_ctx->list);
- nfs_iocounter_init(&l_ctx->io_count);
+ atomic_set(&l_ctx->io_count, 0);
}
static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
@@ -909,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp)
if (ctx) {
struct inode *inode = d_inode(ctx->dentry);
+ /*
+ * A fatal error was hit on an earlier write. Try to write
+ * back every page again.
+ */
+ if (ctx->error < 0)
+ invalidate_inode_pages2(inode->i_mapping);
filp->private_data = NULL;
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -924,7 +940,7 @@ int nfs_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
if (IS_ERR(ctx))
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
@@ -1083,6 +1099,27 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
|| NFS_STALE(inode);
}
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
+ int ret = 0;
+
+ if (IS_SWAPFILE(inode))
+ goto out;
+ if (nfs_mapping_need_revalidate_inode(inode)) {
+ ret = -ECHILD;
+ goto out;
+ }
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+ (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+ ret = -ECHILD;
+ spin_unlock(&inode->i_lock);
+out:
+ return ret;
+}
+
/**
* __nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
@@ -1141,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode,
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
if (may_lock) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = nfs_invalidate_mapping(inode, mapping);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
} else
ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
@@ -1638,6 +1675,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
@@ -1699,22 +1737,28 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_force_lookup_revalidate(inode);
inode->i_version = fattr->change_attr;
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity;
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
- } else if (server->caps & NFS_CAP_MTIME)
+ } else if (server->caps & NFS_CAP_MTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- } else if (server->caps & NFS_CAP_CTIME)
+ } else if (server->caps & NFS_CAP_CTIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1734,19 +1778,23 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(long long)cur_isize,
(long long)new_isize);
}
- } else
+ } else {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- else if (server->caps & NFS_CAP_ATIME)
+ else if (server->caps & NFS_CAP_ATIME) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
@@ -1755,36 +1803,42 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_mode = newmode;
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
}
- } else if (server->caps & NFS_CAP_MODE)
+ } else if (server->caps & NFS_CAP_MODE) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
if (!uid_eq(inode->i_uid, fattr->uid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_uid = fattr->uid;
}
- } else if (server->caps & NFS_CAP_OWNER)
+ } else if (server->caps & NFS_CAP_OWNER) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
if (!gid_eq(inode->i_gid, fattr->gid)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
inode->i_gid = fattr->gid;
}
- } else if (server->caps & NFS_CAP_OWNER_GROUP)
+ } else if (server->caps & NFS_CAP_OWNER_GROUP) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
if (inode->i_nlink != fattr->nlink) {
@@ -1793,19 +1847,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_DATA;
set_nlink(inode, fattr->nlink);
}
- } else if (server->caps & NFS_CAP_NLINK)
+ } else if (server->caps & NFS_CAP_NLINK) {
nfsi->cache_validity |= save_cache_validity &
(NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
+ cache_revalidated = false;
+ }
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- }
- if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
+ else
+ cache_revalidated = false;
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1815,16 +1872,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Set barrier to be more recent than all outstanding updates */
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
- if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
- if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
- nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ if (cache_revalidated) {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp,
+ nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ nfsi->attrtimeo <<= 1;
+ if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode))
+ nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
+ }
nfsi->attrtimeo_timestamp = now;
}
/* Set the barrier to be more recent than this fattr */
if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
nfsi->attr_gencount = fattr->gencount;
}
- invalid &= ~NFS_INO_INVALID_ATTR;
+
+ /* Don't declare attrcache up to date if there were no attrs! */
+ if (cache_revalidated)
+ invalid &= ~NFS_INO_INVALID_ATTR;
+
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
@@ -1904,7 +1969,7 @@ static int __init nfs_init_inodecache(void)
nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
sizeof(struct nfs_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (nfs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 56cfde26fb9c..f1d1d2c472e9 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
-int nfs_iocounter_wait(struct nfs_io_counter *c);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
@@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
-static inline void nfs_iocounter_init(struct nfs_io_counter *c)
-{
- c->flags = 0;
- atomic_set(&c->io_count, 0);
-}
-
static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
{
WARN_ON_ONCE(desc->pg_mirror_count < 1);
return desc->pg_mirror_count > 1;
}
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+ const struct nfs_open_context *ctx2)
+{
+ return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
/* nfs2xdr.c */
extern struct rpc_procinfo nfs_procedures[];
extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -358,7 +358,7 @@ int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
/* file.c */
-int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
+int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
loff_t nfs_file_llseek(struct file *, loff_t, int);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
@@ -379,7 +379,8 @@ extern int nfs_drop_inode(struct inode *);
extern void nfs_clear_inode(struct inode *);
extern void nfs_evict_inode(struct inode *);
void nfs_zap_acl_cache(struct inode *inode);
-extern int nfs_wait_bit_killable(struct wait_bit_key *key);
+extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern int nfs_wait_atomic_killable(atomic_t *p);
/* super.c */
extern const struct super_operations nfs_sops;
@@ -483,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list,
struct nfs_commit_info *cinfo,
u32 ds_commit_idx);
void nfs_commitdata_release(struct nfs_commit_data *data);
-void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+void nfs_request_add_commit_list(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void nfs_request_add_commit_list_locked(struct nfs_page *req,
struct list_head *dst,
@@ -514,12 +515,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
/* direct.c */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq);
-static inline void nfs_inode_dio_wait(struct inode *inode)
-{
- inode_dio_wait(inode);
-}
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
-extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -642,11 +638,11 @@ unsigned int nfs_page_length(struct page *page)
if (i_size > 0) {
pgoff_t page_index = page_file_index(page);
- pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
if (page_index < end_index)
- return PAGE_CACHE_SIZE;
+ return PAGE_SIZE;
if (page_index == end_index)
- return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+ return ((i_size - 1) & ~PAGE_MASK) + 1;
}
return 0;
}
@@ -696,9 +692,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
}
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+ return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+ NFS4_STATEID_OTHER_SIZE);
+}
#else
static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
{
return 0;
}
+static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+{
+ return 0;
+}
#endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+ switch (err) {
+ case -ERESTARTSYS:
+ case -EIO:
+ case -ENOSPC:
+ case -EROFS:
+ case -E2BIG:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 99a45283b9ee..09b190015df4 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,9 +16,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
-#ifdef NFS_DEBUG
-# define NFSDBG_FACILITY NFSDBG_MOUNT
-#endif
+#define NFSDBG_FACILITY NFSDBG_MOUNT
/*
* Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 1ebe2fc7cda2..17c0fa1eccfa 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -284,12 +284,12 @@ nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
int error;
error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
- POSIX_ACL_XATTR_ACCESS, data, size, &result);
+ XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result);
if (error)
return error;
error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
- POSIX_ACL_XATTR_DEFAULT, data, size, &result);
+ XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result);
if (error)
return error;
return result;
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 814c1255f1d2..b587ccd31083 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,5 +17,6 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 0f020e4d8421..dff83460e5a6 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -14,31 +14,10 @@
#include "pnfs.h"
#include "internal.h"
-#define NFSDBG_FACILITY NFSDBG_PNFS
-
-static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
- fmode_t fmode)
-{
- struct nfs_open_context *open;
- struct nfs_lock_context *lock;
- int ret;
-
- open = get_nfs_open_context(nfs_file_open_context(file));
- lock = nfs_get_lock_context(open);
- if (IS_ERR(lock)) {
- put_nfs_open_context(open);
- return PTR_ERR(lock);
- }
-
- ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
-
- nfs_put_lock_context(lock);
- put_nfs_open_context(open);
- return ret;
-}
+#define NFSDBG_FACILITY NFSDBG_PROC
static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
- loff_t offset, loff_t len)
+ struct nfs_lock_context *lock, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(filep);
struct nfs_server *server = NFS_SERVER(inode);
@@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
msg->rpc_argp = &args;
msg->rpc_resp = &res;
- status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
+ status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
+ lock, FMODE_WRITE);
if (status)
return status;
@@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
int err;
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = file_inode(filep);
+ exception.state = lock->open_context->state;
+
do {
- err = _nfs42_proc_fallocate(msg, filep, offset, len);
- if (err == -ENOTSUPP)
- return -EOPNOTSUPP;
+ err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
+ nfs_put_lock_context(lock);
return err;
}
@@ -101,13 +92,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -123,7 +114,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return -EOPNOTSUPP;
nfs_wb_all(inode);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == 0)
@@ -131,11 +122,12 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
-static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+static loff_t _nfs42_proc_llseek(struct file *filep,
+ struct nfs_lock_context *lock, loff_t offset, int whence)
{
struct inode *inode = file_inode(filep);
struct nfs42_seek_args args = {
@@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
if (!nfs_server_capable(inode, NFS_CAP_SEEK))
return -ENOTSUPP;
- status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+ status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
+ lock, FMODE_READ);
if (status)
return status;
@@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
+ struct nfs_lock_context *lock;
loff_t err;
+ lock = nfs_get_lock_context(nfs_file_open_context(filep));
+ if (IS_ERR(lock))
+ return PTR_ERR(lock);
+
+ exception.inode = file_inode(filep);
+ exception.state = lock->open_context->state;
+
do {
- err = _nfs42_proc_llseek(filep, offset, whence);
+ err = _nfs42_proc_llseek(filep, lock, offset, whence);
if (err >= 0)
break;
- if (err == -ENOTSUPP)
- return -EOPNOTSUPP;
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
err = nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
+ nfs_put_lock_context(lock);
return err;
}
@@ -204,6 +208,8 @@ static void
nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
+ struct inode *inode = data->inode;
+ struct pnfs_layout_hdr *lo;
if (!nfs4_sequence_done(task, &data->res.seq_res))
return;
@@ -211,12 +217,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (lo && nfs4_stateid_match(&data->args.stateid,
+ &lo->plh_stateid)) {
+ LIST_HEAD(head);
+
+ /*
+ * Mark the bad layout state as invalid, then retry
+ * with the current stateid.
+ */
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
+ } else
+ spin_unlock(&inode->i_lock);
+ break;
case -ENOTSUPP:
case -EOPNOTSUPP:
- NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
default:
- dprintk("%s server returns %d\n", __func__, task->tk_status);
+ break;
}
+
+ dprintk("%s server returns %d\n", __func__, task->tk_status);
}
static void
@@ -271,3 +300,104 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
return PTR_ERR(task);
return 0;
}
+
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+ struct file *dst_f, struct nfs_lock_context *src_lock,
+ struct nfs_lock_context *dst_lock, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
+{
+ struct inode *src_inode = file_inode(src_f);
+ struct inode *dst_inode = file_inode(dst_f);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct nfs42_clone_args args = {
+ .src_fh = NFS_FH(src_inode),
+ .dst_fh = NFS_FH(dst_inode),
+ .src_offset = src_offset,
+ .dst_offset = dst_offset,
+ .count = count,
+ .dst_bitmask = server->cache_consistency_bitmask,
+ };
+ struct nfs42_clone_res res = {
+ .server = server,
+ };
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+ src_lock, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
+ if (status)
+ return status;
+
+ res.dst_fattr = nfs_alloc_fattr();
+ if (!res.dst_fattr)
+ return -ENOMEM;
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == 0)
+ status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+
+ kfree(res.dst_fattr);
+ return status;
+}
+
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+ loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+ };
+ struct inode *inode = file_inode(src_f);
+ struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs4_exception src_exception = { };
+ struct nfs4_exception dst_exception = { };
+ int err, err2;
+
+ if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+ return -EOPNOTSUPP;
+
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.inode = file_inode(src_f);
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.inode = file_inode(dst_f);
+ dst_exception.state = dst_lock->open_context->state;
+
+ do {
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
+ src_offset, dst_offset, count);
+ if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0eb29e14070d..0ca482a51e53 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -34,6 +34,12 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_clone_maxsz (encode_stateid_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* src offset */ + \
+ 2 /* dst offset */ + \
+ 2 /* count */)
+#define decode_clone_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
@@ -65,7 +71,20 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
-
+#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_clone_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_clone_maxsz + \
+ decode_getattr_maxsz)
static void encode_fallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args)
@@ -128,6 +147,21 @@ static void encode_layoutstats(struct xdr_stream *xdr,
encode_uint32(xdr, 0);
}
+static void encode_clone(struct xdr_stream *xdr,
+ struct nfs42_clone_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+ p = reserve_space(xdr, 3*8);
+ p = xdr_encode_hyper(p, args->src_offset);
+ p = xdr_encode_hyper(p, args->dst_offset);
+ xdr_encode_hyper(p, args->count);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -206,6 +240,27 @@ static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode CLONE request
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_clone(xdr, args, &hdr);
+ encode_getfattr(xdr, args->dst_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -243,6 +298,11 @@ static int decode_layoutstats(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
+static int decode_clone(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_CLONE);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -351,4 +411,39 @@ out:
return status;
}
+/*
+ * Decode CLONE request
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_clone(xdr);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->dst_fattr, res->server);
+
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 50cfc4ca7a02..4afdee420d25 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -183,10 +183,12 @@ struct nfs4_state {
struct nfs4_exception {
- long timeout;
- int retry;
struct nfs4_state *state;
struct inode *inode;
+ long timeout;
+ unsigned char delay : 1,
+ recovering : 1,
+ retry : 1;
};
struct nfs4_state_recovery_ops {
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 223bedda64ae..10410e8b5853 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion)
return ret;
idr_preload(GFP_KERNEL);
spin_lock(&nn->nfs_client_lock);
- ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
+ ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT);
if (ret >= 0)
clp->cl_cb_ident = ret;
spin_unlock(&nn->nfs_client_lock);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index b0dbe0abed53..d0390516467c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -4,8 +4,10 @@
* Copyright (C) 1992 Rick Sladkey
*/
#include <linux/fs.h>
+#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/nfs_fs.h>
+#include <uapi/linux/btrfs.h> /* BTRFS_IOC_CLONE/BTRFS_IOC_CLONE_RANGE */
#include "delegation.h"
#include "internal.h"
#include "iostat.h"
@@ -24,7 +26,7 @@ static int
nfs4_file_open(struct inode *inode, struct file *filp)
{
struct nfs_open_context *ctx;
- struct dentry *dentry = filp->f_path.dentry;
+ struct dentry *dentry = file_dentry(filp);
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
@@ -55,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+ ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -126,37 +128,6 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
return vfs_fsync(file, 0);
}
-static int
-nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
-{
- int ret;
- struct inode *inode = file_inode(file);
-
- trace_nfs_fsync_enter(inode);
-
- nfs_inode_dio_wait(inode);
- do {
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (ret != 0)
- break;
- mutex_lock(&inode->i_mutex);
- ret = nfs_file_fsync_commit(file, start, end, datasync);
- if (!ret)
- ret = pnfs_sync_inode(inode, !!datasync);
- mutex_unlock(&inode->i_mutex);
- /*
- * If nfs_file_fsync_commit detected a server reboot, then
- * resend all dirty pages that might have been covered by
- * the NFS_CONTEXT_RESEND_WRITES flag
- */
- start = 0;
- end = LLONG_MAX;
- } while (ret == -EAGAIN);
-
- trace_nfs_fsync_exit(inode, ret);
- return ret;
-}
-
#ifdef CONFIG_NFS_V4_2
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
{
@@ -192,28 +163,90 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_deallocate(filep, offset, len);
return nfs42_proc_allocate(filep, offset, len);
}
+
+static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off, u64 count)
+{
+ struct inode *dst_inode = file_inode(dst_file);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct inode *src_inode = file_inode(src_file);
+ unsigned int bs = server->clone_blksize;
+ bool same_inode = false;
+ int ret;
+
+ /* check alignment w.r.t. clone_blksize */
+ ret = -EINVAL;
+ if (bs) {
+ if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+ goto out;
+ if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+ goto out;
+ }
+
+ if (src_inode == dst_inode)
+ same_inode = true;
+
+ /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+ if (same_inode) {
+ inode_lock(src_inode);
+ } else if (dst_inode < src_inode) {
+ inode_lock_nested(dst_inode, I_MUTEX_PARENT);
+ inode_lock_nested(src_inode, I_MUTEX_CHILD);
+ } else {
+ inode_lock_nested(src_inode, I_MUTEX_PARENT);
+ inode_lock_nested(dst_inode, I_MUTEX_CHILD);
+ }
+
+ /* flush all pending writes on both src and dst so that server
+ * has the latest data */
+ ret = nfs_sync_inode(src_inode);
+ if (ret)
+ goto out_unlock;
+ ret = nfs_sync_inode(dst_inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count);
+
+ /* truncate inode page cache of the dst range so that future reads can fetch
+ * new data from server */
+ if (!ret)
+ truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+ if (same_inode) {
+ inode_unlock(src_inode);
+ } else if (dst_inode < src_inode) {
+ inode_unlock(src_inode);
+ inode_unlock(dst_inode);
+ } else {
+ inode_unlock(dst_inode);
+ inode_unlock(src_inode);
+ }
+out:
+ return ret;
+}
#endif /* CONFIG_NFS_V4_2 */
const struct file_operations nfs4_file_operations = {
-#ifdef CONFIG_NFS_V4_2
- .llseek = nfs4_file_llseek,
-#else
- .llseek = nfs_file_llseek,
-#endif
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs4_file_open,
.flush = nfs4_file_flush,
.release = nfs_file_release,
- .fsync = nfs4_file_fsync,
+ .fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
.splice_write = iter_file_splice_write,
-#ifdef CONFIG_NFS_V4_2
- .fallocate = nfs42_fallocate,
-#endif /* CONFIG_NFS_V4_2 */
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+#ifdef CONFIG_NFS_V4_2
+ .llseek = nfs4_file_llseek,
+ .fallocate = nfs42_fallocate,
+ .clone_file_range = nfs42_clone_file_range,
+#else
+ .llseek = nfs_file_llseek,
+#endif
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0e5ff69455c7..327b8c34d360 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -78,7 +78,6 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -209,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY,
FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+ | FATTR4_WORD2_SECURITY_LABEL
+#endif
};
static const u32 nfs4_open_noattr_bitmap[3] = {
@@ -239,6 +241,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD1_TIME_DELTA
| FATTR4_WORD1_FS_LAYOUT_TYPES,
FATTR4_WORD2_LAYOUT_BLKSIZE
+ | FATTR4_WORD2_CLONE_BLKSIZE
};
const u32 nfs4_fs_locations_bitmap[3] = {
@@ -344,13 +347,16 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
-int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_do_handle_exception(struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
struct inode *inode = exception->inode;
int ret = errorcode;
+ exception->delay = 0;
+ exception->recovering = 0;
exception->retry = 0;
switch(errorcode) {
case 0:
@@ -359,11 +365,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs4_inode_return_delegation(inode);
- exception->retry = 1;
- return 0;
- }
+ if (inode && nfs_async_inode_return_delegation(inode,
+ NULL) == 0)
+ goto wait_on_recovery;
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -409,11 +413,12 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
ret = -EBUSY;
break;
}
- case -NFS4ERR_GRACE:
case -NFS4ERR_DELAY:
- ret = nfs4_delay(server->client, &exception->timeout);
- if (ret != 0)
- break;
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
+ case -NFS4ERR_GRACE:
+ exception->delay = 1;
+ return 0;
+
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_OLD_STATEID:
exception->retry = 1;
@@ -434,14 +439,85 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
/* We failed to handle the error */
return nfs4_map_errors(ret);
wait_on_recovery:
- ret = nfs4_wait_clnt_recover(clp);
+ exception->recovering = 1;
+ return 0;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ ret = nfs4_delay(server->client, &exception->timeout);
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ ret = nfs4_wait_clnt_recover(clp);
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -EIO;
+ goto out_retry;
+ }
+ return ret;
+out_retry:
+ if (ret == 0)
+ exception->retry = 1;
+ return ret;
+}
+
+static int
+nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ rpc_delay(task, nfs4_update_delay(&exception->timeout));
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+ rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+ goto out_retry;
+ }
if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- return -EIO;
+ ret = -EIO;
+ return ret;
+out_retry:
if (ret == 0)
exception->retry = 1;
return ret;
}
+static int
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
+{
+ struct nfs4_exception exception = {
+ .state = state,
+ };
+
+ if (task->tk_status >= 0)
+ return 0;
+ if (timeout)
+ exception.timeout = *timeout;
+ task->tk_status = nfs4_async_handle_exception(task, server,
+ task->tk_status,
+ &exception);
+ if (exception.delay && timeout)
+ *timeout = exception.timeout;
+ if (exception.retry)
+ return -EAGAIN;
+ return 0;
+}
+
/*
* Return 'true' if 'clp' is using an rpc_client that is integrity protected
* or 'false' otherwise.
@@ -1312,6 +1388,7 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
*/
+ spin_lock(&state->owner->so_lock);
write_seqlock(&state->seqlock);
if (deleg_stateid != NULL) {
nfs4_stateid_copy(&state->stateid, deleg_stateid);
@@ -1320,7 +1397,6 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
if (open_stateid != NULL)
nfs_set_open_stateid_locked(state, open_stateid, fmode);
write_sequnlock(&state->seqlock);
- spin_lock(&state->owner->so_lock);
update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
}
@@ -1525,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
if (!data->rpc_done) {
state = nfs4_try_open_cached(data);
+ trace_nfs4_cached_open(data->state);
goto out;
}
@@ -1942,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
}
return;
unlock_no_action:
+ trace_nfs4_cached_open(data->state);
rcu_read_unlock();
out_no_action:
task->tk_action = NULL;
@@ -2383,14 +2461,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dentry = opendata->dentry;
if (d_really_is_negative(dentry)) {
- /* FIXME: Is this d_drop() ever needed? */
+ struct dentry *alias;
d_drop(dentry);
- dentry = d_add_unique(dentry, igrab(state->inode));
- if (dentry == NULL) {
- dentry = opendata->dentry;
- } else if (dentry != ctx->dentry) {
+ alias = d_exact_alias(dentry, state->inode);
+ if (!alias)
+ alias = d_splice_alias(igrab(state->inode), dentry);
+ /* d_splice_alias() can't fail here - it's a non-directory */
+ if (alias) {
dput(ctx->dentry);
- ctx->dentry = dget(dentry);
+ ctx->dentry = dentry = alias;
}
nfs_set_verifier(dentry,
nfs_save_change_attribute(d_inode(opendata->dir)));
@@ -2630,6 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
+ trace_nfs4_setattr(inode, &arg.stateid, status);
return status;
}
@@ -2646,7 +2726,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
int err;
do {
err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
- trace_nfs4_setattr(inode, err);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -4530,7 +4609,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
static int buf_to_pages_noslab(const void *buf, size_t buflen,
- struct page **pages, unsigned int *pgbase)
+ struct page **pages)
{
struct page *newpage, **spages;
int rc = 0;
@@ -4674,7 +4753,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
goto out_free;
args.acl_len = npages * PAGE_SIZE;
- args.acl_pgbase = 0;
dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
__func__, buf, buflen, npages, args.acl_len);
@@ -4766,7 +4844,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
- i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+ i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
if (i < 0)
return i;
nfs4_inode_return_delegation(inode);
@@ -4955,79 +5033,6 @@ out:
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
- struct nfs4_state *state, long *timeout)
-{
- struct nfs_client *clp = server->nfs_client;
-
- if (task->tk_status >= 0)
- return 0;
- switch(task->tk_status) {
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OPENMODE:
- if (state == NULL)
- break;
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_EXPIRED:
- if (state != NULL) {
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- }
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_STALE_CLIENTID:
- nfs4_schedule_lease_recovery(clp);
- goto wait_on_recovery;
- case -NFS4ERR_MOVED:
- if (nfs4_schedule_migration_recovery(server) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_LEASE_MOVED:
- nfs4_schedule_lease_moved_recovery(clp);
- goto wait_on_recovery;
-#if defined(CONFIG_NFS_V4_1)
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
- dprintk("%s ERROR %d, Reset session\n", __func__,
- task->tk_status);
- nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- goto wait_on_recovery;
-#endif /* CONFIG_NFS_V4_1 */
- case -NFS4ERR_DELAY:
- nfs_inc_server_stats(server, NFSIOS_DELAY);
- rpc_delay(task, nfs4_update_delay(timeout));
- goto restart_call;
- case -NFS4ERR_GRACE:
- rpc_delay(task, NFS4_POLL_RETRY_MAX);
- case -NFS4ERR_RETRY_UNCACHED_REP:
- case -NFS4ERR_OLD_STATEID:
- goto restart_call;
- }
- task->tk_status = nfs4_map_errors(task->tk_status);
- return 0;
-recovery_failed:
- task->tk_status = -EIO;
- return 0;
-wait_on_recovery:
- rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
- if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
- rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
- if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- goto recovery_failed;
-restart_call:
- task->tk_status = 0;
- return -EAGAIN;
-}
-
static void nfs4_init_boot_verifier(const struct nfs_client *clp,
nfs4_verifier *bootverf)
{
@@ -5049,7 +5054,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
static int
nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5077,7 +5081,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
return -ENOMEM;
rcu_read_lock();
- result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+ scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
clp->cl_ipaddr,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
@@ -5090,7 +5094,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
static int
nfs4_init_uniquifier_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5110,7 +5113,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
clp->rpc_ops->version, clp->cl_minorversion,
nfs4_client_id_uniquifier,
clp->cl_rpcclient->cl_nodename);
@@ -5121,7 +5124,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
static int
nfs4_init_uniform_client_string(struct nfs_client *clp)
{
- int result;
size_t len;
char *str;
@@ -5146,7 +5148,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
if (!str)
return -ENOMEM;
- result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+ scnprintf(str, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
clp->cl_rpcclient->cl_nodename);
clp->cl_owner_id = str;
@@ -5385,6 +5387,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
if (data == NULL)
return -ENOMEM;
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+
+ nfs4_state_protect(server->nfs_client,
+ NFS_SP4_MACH_CRED_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
data->args.fhandle = &data->fh;
data->args.stateid = &data->stateid;
data->args.bitmask = server->cache_consistency_bitmask;
@@ -5427,7 +5434,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
int err;
do {
err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
- trace_nfs4_delegreturn(inode, err);
+ trace_nfs4_delegreturn(inode, stateid, err);
switch (err) {
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
@@ -5522,7 +5529,7 @@ struct nfs4_unlockdata {
struct nfs4_lock_state *lsp;
struct nfs_open_context *ctx;
struct file_lock fl;
- const struct nfs_server *server;
+ struct nfs_server *server;
unsigned long timestamp;
};
@@ -5937,6 +5944,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
data->cancelled = 1;
rpc_put_task(task);
dprintk("%s: done, ret = %d!\n", __func__, ret);
+ trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
return ret;
}
@@ -5953,7 +5961,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
- trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
if (err != -NFS4ERR_DELAY)
break;
nfs4_handle_exception(server, err, &exception);
@@ -5980,7 +5987,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
return 0;
err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
- trace_nfs4_lock_expired(request, state, F_SETLK, err);
switch (err) {
default:
goto out;
@@ -6088,7 +6094,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
do {
err = _nfs4_proc_setlk(state, cmd, request);
- trace_nfs4_set_lock(request, state, cmd, err);
if (err == -NFS4ERR_DENIED)
err = -EAGAIN;
err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -6249,48 +6254,32 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
-static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
+static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
const void *buf, size_t buflen,
- int flags, int type)
+ int flags)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
}
-static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
- if (strcmp(key, "") != 0)
- return -EINVAL;
-
return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
}
-static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int type)
+static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
{
- size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
-
- if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
- return 0;
-
- if (list && len <= list_len)
- memcpy(list, XATTR_NAME_NFSV4_ACL, len);
- return len;
+ return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
}
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
-static inline int nfs4_server_supports_labels(struct nfs_server *server)
-{
- return server->caps & NFS_CAP_SECURITY_LABEL;
-}
-static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags, int type)
+static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ const void *buf, size_t buflen,
+ int flags)
{
if (security_ismaclabel(key))
return nfs4_set_security_label(dentry, buf, buflen);
@@ -6298,36 +6287,43 @@ static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
return -EOPNOTSUPP;
}
-static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
if (security_ismaclabel(key))
return nfs4_get_security_label(d_inode(dentry), buf, buflen);
return -EOPNOTSUPP;
}
-static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int type)
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
{
- size_t len = 0;
+ int len = 0;
- if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
- len = security_inode_listsecurity(d_inode(dentry), NULL, 0);
- if (list && len <= list_len)
- security_inode_listsecurity(d_inode(dentry), list, len);
+ if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
+ len = security_inode_listsecurity(inode, list, list_len);
+ if (list_len && len > list_len)
+ return -ERANGE;
}
return len;
}
static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = nfs4_xattr_list_nfs4_label,
.get = nfs4_xattr_get_nfs4_label,
.set = nfs4_xattr_set_nfs4_label,
};
-#endif
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+ return 0;
+}
+
+#endif
/*
* nfs_fhget will use either the mounted_on_fileid or the fileid
@@ -6787,13 +6783,26 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
return false;
}
+static void
+nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
+{
+}
+
+static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
+ .rpc_call_done = &nfs4_bind_one_conn_to_session_done,
+};
+
/*
- * nfs4_proc_bind_conn_to_session()
+ * nfs4_proc_bind_one_conn_to_session()
*
* The 4.1 client currently uses the same TCP connection for the
* fore and backchannel.
*/
-int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+static
+int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ struct nfs_client *clp,
+ struct rpc_cred *cred)
{
int status;
struct nfs41_bind_conn_to_session_args args = {
@@ -6808,6 +6817,14 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
.rpc_resp = &res,
.rpc_cred = cred,
};
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .rpc_xprt = xprt,
+ .callback_ops = &nfs4_bind_one_conn_to_session_ops,
+ .rpc_message = &msg,
+ .flags = RPC_TASK_TIMEOUT,
+ };
+ struct rpc_task *task;
dprintk("--> %s\n", __func__);
@@ -6815,7 +6832,16 @@ int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred
if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
args.dir = NFS4_CDFC4_FORE;
- status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
+ /* Do not set the backchannel flag unless this is clnt->cl_xprt */
+ if (xprt != rcu_access_pointer(clnt->cl_xprt))
+ args.dir = NFS4_CDFC4_FORE;
+
+ task = rpc_run_task(&task_setup_data);
+ if (!IS_ERR(task)) {
+ status = task->tk_status;
+ rpc_put_task(task);
+ } else
+ status = PTR_ERR(task);
trace_nfs4_bind_conn_to_session(clp, status);
if (status == 0) {
if (memcmp(res.sessionid.data,
@@ -6842,6 +6868,31 @@ out:
return status;
}
+struct rpc_bind_conn_calldata {
+ struct nfs_client *clp;
+ struct rpc_cred *cred;
+};
+
+static int
+nfs4_proc_bind_conn_to_session_callback(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *calldata)
+{
+ struct rpc_bind_conn_calldata *p = calldata;
+
+ return nfs4_proc_bind_one_conn_to_session(clnt, xprt, p->clp, p->cred);
+}
+
+int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
+{
+ struct rpc_bind_conn_calldata data = {
+ .clp = clp,
+ .cred = cred,
+ };
+ return rpc_clnt_iterate_for_each_xprt(clp->cl_rpcclient,
+ nfs4_proc_bind_conn_to_session_callback, &data);
+}
+
/*
* Minimum set of SP4_MACH_CRED operations from RFC 5661 in the enforce map
* and operations we'd like to see to enable certain features in the allow map
@@ -6857,10 +6908,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
},
.allow.u.words = {
[0] = 1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN) |
1 << (OP_COMMIT),
[1] = 1 << (OP_SECINFO - 32) |
1 << (OP_SECINFO_NO_NAME - 32) |
+ 1 << (OP_LAYOUTRETURN - 32) |
1 << (OP_TEST_STATEID - 32) |
1 << (OP_FREE_STATEID - 32) |
1 << (OP_WRITE - 32)
@@ -6925,11 +6979,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
}
if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+ test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+ test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
test_bit(OP_LOCKU, sp->allow.u.longs)) {
dfprintk(MOUNT, " cleanup mode enabled\n");
set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
}
+ if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+ dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
+ set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &clp->cl_sp4_flags);
+ }
+
if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
dfprintk(MOUNT, " secinfo mode enabled\n");
@@ -7313,7 +7375,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->bc_attrs.max_resp_sz = PAGE_SIZE;
args->bc_attrs.max_resp_sz_cached = 0;
args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
- args->bc_attrs.max_reqs = 1;
+ args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
@@ -7758,6 +7820,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
struct nfs_server *server = NFS_SERVER(lgp->args.inode);
struct nfs4_session *session = nfs4_get_session(server);
+ int ret;
dprintk("--> %s\n", __func__);
/* Note the is a race here, where a CB_LAYOUTRECALL can come in
@@ -7768,12 +7831,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
if (nfs41_setup_sequence(session, &lgp->args.seq_args,
&lgp->res.seq_res, task))
return;
- if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+ ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
NFS_I(lgp->args.inode)->layout,
&lgp->args.range,
- lgp->args.ctx->state)) {
- rpc_exit(task, NFS4_OK);
- }
+ lgp->args.ctx->state);
+ if (ret < 0)
+ rpc_exit(task, ret);
}
static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -7793,6 +7856,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
goto out;
+
+ /*
+ * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+ * on the file. set tk_status to -ENODATA to tell upper layer to
+ * retry go inband.
+ */
+ case -NFS4ERR_LAYOUTUNAVAILABLE:
+ task->tk_status = -ENODATA;
+ goto out;
/*
* NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
* length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
@@ -7861,7 +7933,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
spin_unlock(&inode->i_lock);
goto out_restart;
}
- if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN)
goto out_restart;
out:
dprintk("<-- %s\n", __func__);
@@ -7989,6 +8061,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
trace_nfs4_layoutget(lgp->args.ctx,
&lgp->args.range,
&lgp->res.range,
+ &lgp->res.stateid,
status);
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
@@ -8045,11 +8118,11 @@ static void nfs4_layoutreturn_release(void *calldata)
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+ pnfs_mark_layout_returned_if_empty(lo);
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
- pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
pnfs_clear_layoutreturn_waitbit(lo);
- lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
pnfs_free_lseg_list(&freeme);
pnfs_put_layout_hdr(lrp->args.layout);
@@ -8081,6 +8154,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
};
int status = 0;
+ nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+ NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+ &task_setup_data.rpc_client, &msg);
+
dprintk("--> %s\n", __func__);
if (!sync) {
lrp->inode = nfs_igrab_and_active(lrp->args.inode);
@@ -8096,7 +8173,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutreturn(lrp->args.inode, status);
+ trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8244,7 +8321,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
return PTR_ERR(task);
if (sync)
status = task->tk_status;
- trace_nfs4_layoutcommit(data->args.inode, status);
+ trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
dprintk("%s: status %d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8718,7 +8795,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_ALLOCATE
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
- | NFS_CAP_LAYOUTSTATS,
+ | NFS_CAP_LAYOUTSTATS
+ | NFS_CAP_CLONE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -8743,6 +8821,24 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif
};
+ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ ssize_t error, error2;
+
+ error = generic_listxattr(dentry, list, size);
+ if (error < 0)
+ return error;
+ if (list) {
+ list += error;
+ size -= error;
+ }
+
+ error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+ if (error2 < 0)
+ return error2;
+ return error + error2;
+}
+
static const struct inode_operations nfs4_dir_inode_operations = {
.create = nfs_create,
.lookup = nfs_lookup,
@@ -8759,7 +8855,7 @@ static const struct inode_operations nfs4_dir_inode_operations = {
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
@@ -8769,7 +8865,7 @@ static const struct inode_operations nfs4_file_inode_operations = {
.setattr = nfs_setattr,
.getxattr = generic_getxattr,
.setxattr = generic_setxattr,
- .listxattr = generic_listxattr,
+ .listxattr = nfs4_listxattr,
.removexattr = generic_removexattr,
};
@@ -8828,7 +8924,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
};
static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
- .prefix = XATTR_NAME_NFSV4_ACL,
+ .name = XATTR_NAME_NFSV4_ACL,
.list = nfs4_xattr_list_nfs4_acl,
.get = nfs4_xattr_get_nfs4_acl,
.set = nfs4_xattr_set_nfs4_acl,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e23366effcfb..332d06e64fa9 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -135,6 +135,43 @@ static struct nfs4_slot *nfs4_find_or_create_slot(struct nfs4_slot_table *tbl,
return ERR_PTR(-ENOMEM);
}
+static void nfs4_lock_slot(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *slot)
+{
+ u32 slotid = slot->slot_nr;
+
+ __set_bit(slotid, tbl->used_slots);
+ if (slotid > tbl->highest_used_slotid ||
+ tbl->highest_used_slotid == NFS4_NO_SLOT)
+ tbl->highest_used_slotid = slotid;
+ slot->generation = tbl->generation;
+}
+
+/*
+ * nfs4_try_to_lock_slot - Given a slot try to allocate it
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot)
+{
+ if (nfs4_test_locked_slot(tbl, slot->slot_nr))
+ return false;
+ nfs4_lock_slot(tbl, slot);
+ return true;
+}
+
+/*
+ * nfs4_lookup_slot - Find a slot but don't allocate it
+ *
+ * Note: must be called with the slot_tbl_lock held.
+ */
+struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
+{
+ if (slotid <= tbl->max_slotid)
+ return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ return ERR_PTR(-E2BIG);
+}
+
/*
* nfs4_alloc_slot - efficiently look for a free slot
*
@@ -153,18 +190,11 @@ struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl)
__func__, tbl->used_slots[0], tbl->highest_used_slotid,
tbl->max_slotid + 1);
slotid = find_first_zero_bit(tbl->used_slots, tbl->max_slotid + 1);
- if (slotid > tbl->max_slotid)
- goto out;
- ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
- if (IS_ERR(ret))
- goto out;
- __set_bit(slotid, tbl->used_slots);
- if (slotid > tbl->highest_used_slotid ||
- tbl->highest_used_slotid == NFS4_NO_SLOT)
- tbl->highest_used_slotid = slotid;
- ret->generation = tbl->generation;
-
-out:
+ if (slotid <= tbl->max_slotid) {
+ ret = nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT);
+ if (!IS_ERR(ret))
+ nfs4_lock_slot(tbl, ret);
+ }
dprintk("<-- %s used_slots=%04lx highest_used=%u slotid=%u\n",
__func__, tbl->used_slots[0], tbl->highest_used_slotid,
!IS_ERR(ret) ? ret->slot_nr : NFS4_NO_SLOT);
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index e3ea2c5324d6..5b51298d1d03 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -77,6 +77,8 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
unsigned int max_reqs, const char *queue);
extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
+extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
+extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl,
@@ -88,6 +90,12 @@ static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl)
return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
}
+static inline bool nfs4_test_locked_slot(const struct nfs4_slot_table *tbl,
+ u32 slotid)
+{
+ return !!test_bit(slotid, tbl->used_slots);
+}
+
#if defined(CONFIG_NFS_V4_1)
extern void nfs41_set_target_slotid(struct nfs4_slot_table *tbl,
u32 target_highest_slotid);
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 0fbd3ab1be22..8693d77c45ea 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -12,7 +12,7 @@
#include "nfs4idmap.h"
#include "callback.h"
-static const int nfs_set_port_min = 0;
+static const int nfs_set_port_min;
static const int nfs_set_port_max = 65535;
static struct ctl_table_header *nfs4_callback_sysctl_table;
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d774335cc8bc..2850bce19244 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -6,6 +6,7 @@
#include "internal.h"
#include "nfs4session.h"
#include "callback.h"
+#include "pnfs.h"
#define CREATE_TRACE_POINTS
#include "nfs4trace.h"
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 671cf68fe56b..2c8d05dae5b1 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done,
__entry->highest_slotid = res->sr_highest_slotid;
__entry->target_highest_slotid =
res->sr_target_highest_slotid;
+ __entry->status_flags = res->sr_status_flags;
__entry->error = res->sr_status;
),
TP_printk(
@@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__field(u64, fileid)
__field(u64, dir)
__string(name, ctx->dentry->d_name.name)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, openstateid_seq)
+ __field(u32, openstateid_hash)
),
TP_fast_assign(
@@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->flags = flags;
__entry->fmode = (__force unsigned int)ctx->mode;
__entry->dev = ctx->dentry->d_sb->s_dev;
- if (!IS_ERR_OR_NULL(state))
+ if (!IS_ERR_OR_NULL(state)) {
inode = state->inode;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->openstateid_seq =
+ be32_to_cpu(state->open_stateid.seqid);
+ __entry->openstateid_hash =
+ nfs_stateid_hash(&state->open_stateid);
+ } else {
+ __entry->stateid_seq = 0;
+ __entry->stateid_hash = 0;
+ __entry->openstateid_seq = 0;
+ __entry->openstateid_hash = 0;
+ }
if (inode != NULL) {
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
TP_printk(
"error=%d (%s) flags=%d (%s) fmode=%s "
"fileid=%02x:%02x:%llu fhandle=0x%08x "
- "name=%02x:%02x:%llu/%s",
+ "name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+ "openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->flags,
@@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->fhandle,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->openstateid_seq, __entry->openstateid_hash
)
);
@@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
+TRACE_EVENT(nfs4_cached_open,
+ TP_PROTO(
+ const struct nfs4_state *state
+ ),
+ TP_ARGS(state),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(unsigned int, fmode)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->fmode = (__force unsigned int)state->state;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ ),
+
+ TP_printk(
+ "fmode=%s fileid=%02x:%02x:%llu "
+ "fhandle=0x%08x stateid=%d:0x%08x",
+ __entry->fmode ? show_fmode_flags(__entry->fmode) :
+ "closed",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
TRACE_EVENT(nfs4_close,
TP_PROTO(
const struct nfs4_state *state,
@@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close,
__field(u64, fileid)
__field(unsigned int, fmode)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close,
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->fmode = (__force unsigned int)state->state;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&args->stateid);
),
TP_printk(
"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
- "fhandle=0x%08x",
+ "fhandle=0x%08x openstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
__entry->fmode ? show_fmode_flags(__entry->fmode) :
"closed",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) cmd=%s:%s range=%lld:%lld "
- "fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
show_lock_cmd(__entry->cmd),
@@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
(long long)__entry->end,
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
), \
TP_ARGS(request, state, cmd, error))
DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
+TRACE_EVENT(nfs4_set_lock,
+ TP_PROTO(
+ const struct file_lock *request,
+ const struct nfs4_state *state,
+ const nfs4_stateid *lockstateid,
+ int cmd,
+ int error
+ ),
+
+ TP_ARGS(request, state, lockstateid, cmd, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(int, cmd)
+ __field(char, type)
+ __field(loff_t, start)
+ __field(loff_t, end)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, lockstateid_seq)
+ __field(u32, lockstateid_hash)
+ ),
+
+ TP_fast_assign(
+ const struct inode *inode = state->inode;
+
+ __entry->error = error;
+ __entry->cmd = cmd;
+ __entry->type = request->fl_type;
+ __entry->start = request->fl_start;
+ __entry->end = request->fl_end;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ __entry->lockstateid_seq =
+ be32_to_cpu(lockstateid->seqid);
+ __entry->lockstateid_hash =
+ nfs_stateid_hash(lockstateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) cmd=%s:%s range=%lld:%lld "
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x lockstateid=%d:0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ show_lock_cmd(__entry->cmd),
+ show_lock_type(__entry->type),
+ (long long)__entry->start,
+ (long long)__entry->end,
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->lockstateid_seq, __entry->lockstateid_hash
+ )
+);
+
DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
TP_PROTO(
const struct inode *inode,
@@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit,
__field(dev_t, dev)
__field(u32, fhandle)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
__entry->dev = res->server->s_dev;
__entry->fhandle = nfs_fhandle_hash(args->fhandle);
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(args->stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(args->stateid);
),
TP_printk(
- "error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+ "error=%d (%s) dev=%02x:%02x fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
@@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
- "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
- __entry->fhandle
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
), \
TP_ARGS(inode, error))
-DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
DEFINE_NFS4_INODE_EVENT(nfs4_access);
DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
@@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
-DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
+ TP_PROTO(
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+ TP_PROTO( \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(inode, stateid, error))
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
DECLARE_EVENT_CLASS(nfs4_getattr_event,
TP_PROTO(
@@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
), \
TP_ARGS(clp, fhandle, inode, error))
DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
-DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ const nfs4_stateid *stateid,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, stateid, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ __entry->stateid_seq =
+ be32_to_cpu(stateid->seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(stateid);
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "stateid=%d:0x%08x dstaddr=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ const nfs4_stateid *stateid, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, stateid, error))
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_PROTO(
@@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
#define DEFINE_NFS4_READ_EVENT(name) \
@@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
__field(loff_t, offset)
__field(size_t, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
),
TP_fast_assign(
const struct inode *inode = hdr->inode;
+ const struct nfs4_state *state =
+ hdr->args.context->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
__entry->offset = hdr->args.offset;
__entry->count = hdr->args.count;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zu",
+ "offset=%lld count=%zu stateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
(long long)__entry->offset,
- __entry->count
+ __entry->count,
+ __entry->stateid_seq, __entry->stateid_hash
)
);
@@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget,
const struct nfs_open_context *ctx,
const struct pnfs_layout_range *args,
const struct pnfs_layout_range *res,
+ const nfs4_stateid *layout_stateid,
int error
),
- TP_ARGS(ctx, args, res, error),
+ TP_ARGS(ctx, args, res, layout_stateid, error),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget,
__field(u64, offset)
__field(u64, count)
__field(int, error)
+ __field(int, stateid_seq)
+ __field(u32, stateid_hash)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
),
TP_fast_assign(
const struct inode *inode = d_inode(ctx->dentry);
+ const struct nfs4_state *state = ctx->state;
__entry->dev = inode->i_sb->s_dev;
__entry->fileid = NFS_FILEID(inode);
__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget,
__entry->offset = args->offset;
__entry->count = args->length;
__entry->error = error;
+ __entry->stateid_seq =
+ be32_to_cpu(state->stateid.seqid);
+ __entry->stateid_hash =
+ nfs_stateid_hash(&state->stateid);
+ if (!error) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(layout_stateid->seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(layout_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
),
TP_printk(
"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "iomode=%s offset=%llu count=%llu",
+ "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+ "layoutstateid=%d:0x%08x",
__entry->error,
show_nfsv4_errors(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget,
__entry->fhandle,
show_pnfs_iomode(__entry->iomode),
(unsigned long long)__entry->offset,
- (unsigned long long)__entry->count
+ (unsigned long long)__entry->count,
+ __entry->stateid_seq, __entry->stateid_hash,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash
)
);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
+#define show_pnfs_update_layout_reason(reason) \
+ __print_symbolic(reason, \
+ { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \
+ { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \
+ { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \
+ { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \
+ { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \
+ { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \
+ { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \
+ { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \
+ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \
+ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \
+ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+
+TRACE_EVENT(pnfs_update_layout,
+ TP_PROTO(struct inode *inode,
+ loff_t pos,
+ u64 count,
+ enum pnfs_iomode iomode,
+ struct pnfs_layout_hdr *lo,
+ enum pnfs_update_layout_reason reason
+ ),
+ TP_ARGS(inode, pos, count, iomode, lo, reason),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, fileid)
+ __field(u32, fhandle)
+ __field(loff_t, pos)
+ __field(u64, count)
+ __field(enum pnfs_iomode, iomode)
+ __field(int, layoutstateid_seq)
+ __field(u32, layoutstateid_hash)
+ __field(enum pnfs_update_layout_reason, reason)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+ __entry->pos = pos;
+ __entry->count = count;
+ __entry->iomode = iomode;
+ __entry->reason = reason;
+ if (lo != NULL) {
+ __entry->layoutstateid_seq =
+ be32_to_cpu(lo->plh_stateid.seqid);
+ __entry->layoutstateid_hash =
+ nfs_stateid_hash(&lo->plh_stateid);
+ } else {
+ __entry->layoutstateid_seq = 0;
+ __entry->layoutstateid_hash = 0;
+ }
+ ),
+ TP_printk(
+ "fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "iomode=%s pos=%llu count=%llu "
+ "layoutstateid=%d:0x%08x (%s)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ show_pnfs_iomode(__entry->iomode),
+ (unsigned long long)__entry->pos,
+ (unsigned long long)__entry->count,
+ __entry->layoutstateid_seq, __entry->layoutstateid_hash,
+ show_pnfs_update_layout_reason(__entry->reason)
+ )
+);
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* _TRACE_NFS4_H */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 788adf3897c7..88474a4fc669 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1659,7 +1659,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
*p = cpu_to_be32(FATTR4_WORD0_ACL);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(arg->acl_len);
- xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+ xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
}
static void
@@ -2491,7 +2491,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- args->acl_pages, args->acl_pgbase, args->acl_len);
+ args->acl_pages, 0, args->acl_len);
encode_nops(&hdr);
}
@@ -3615,6 +3615,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
status = 0;
if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
goto out;
+ bitmap[0] &= ~FATTR4_WORD0_FS_LOCATIONS;
status = -EIO;
/* Ignore borken servers that return unrequested attrs */
if (unlikely(res == NULL))
@@ -4375,6 +4376,11 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
goto xdr_error;
if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0)
goto xdr_error;
if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0)
@@ -4574,6 +4580,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_mode(xdr, bitmap, &fmode);
if (status < 0)
goto xdr_error;
@@ -4627,6 +4637,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
if (status < 0)
goto xdr_error;
@@ -4764,6 +4778,28 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
return 0;
}
+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p)) {
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+ }
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+ }
+ return 0;
+}
+
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
unsigned int savep;
@@ -4789,15 +4825,28 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
goto xdr_error;
fsinfo->wtpref = fsinfo->wtmax;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
if (status != 0)
goto xdr_error;
status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
if (status != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
if (status)
goto xdr_error;
+ status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+ if (status)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
@@ -4952,7 +5001,7 @@ static int decode_space_limit(struct xdr_stream *xdr,
blocksize = be32_to_cpup(p);
maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
- maxsize >>= PAGE_CACHE_SHIFT;
+ maxsize >>= PAGE_SHIFT;
*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
out_overflow:
@@ -7465,6 +7514,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(ALLOCATE, enc_allocate, dec_allocate),
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+ PROC(CLONE, enc_clone, dec_clone),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 9bc9f04fb7f6..89a15dbe5efc 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -90,7 +90,7 @@
#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
-static char nfs_root_parms[256] __initdata = "";
+static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
/* Text-based mount options passed to super.c */
static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 59f838cdc009..9f80a086b612 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
- { 1 << NFS_INO_COMMIT, "COMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5c0c6b58157f..049c1b1f2932 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -476,10 +476,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
}
unlock_page(page);
}
- if (PageDirty(page) || PageWriteback(page))
- *uptodate = true;
- else
- *uptodate = PageUptodate(page);
+ *uptodate = PageUptodate(page);
dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate);
return page;
}
@@ -489,7 +486,7 @@ static void __r4w_put_page(void *priv, struct page *page)
dprintk("%s: index=0x%lx\n", __func__,
(page == ZERO_PAGE(0)) ? -1UL : page->index);
if (ZERO_PAGE(0) != page)
- page_cache_release(page);
+ put_page(page);
return;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index fe3ddd20ff89..1f6db4231057 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p)
kmem_cache_free(nfs_page_cachep, p);
}
-static void
-nfs_iocounter_inc(struct nfs_io_counter *c)
-{
- atomic_inc(&c->io_count);
-}
-
-static void
-nfs_iocounter_dec(struct nfs_io_counter *c)
-{
- if (atomic_dec_and_test(&c->io_count)) {
- clear_bit(NFS_IO_INPROGRESS, &c->flags);
- smp_mb__after_atomic();
- wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
- }
-}
-
-static int
-__nfs_iocounter_wait(struct nfs_io_counter *c)
-{
- wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
- DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
- int ret = 0;
-
- do {
- prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
- set_bit(NFS_IO_INPROGRESS, &c->flags);
- if (atomic_read(&c->io_count) == 0)
- break;
- ret = nfs_wait_bit_killable(&q.key);
- } while (atomic_read(&c->io_count) != 0 && !ret);
- finish_wait(wq, &q.wait);
- return ret;
-}
-
/**
* nfs_iocounter_wait - wait for i/o to complete
- * @c: nfs_io_counter to use
+ * @l_ctx: nfs_lock_context with io_counter to use
*
* returns -ERESTARTSYS if interrupted by a fatal signal.
* Otherwise returns 0 once the io_count hits 0.
*/
int
-nfs_iocounter_wait(struct nfs_io_counter *c)
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
{
- if (atomic_read(&c->io_count) == 0)
- return 0;
- return __nfs_iocounter_wait(c);
+ return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
+ TASK_KILLABLE);
}
/*
@@ -370,14 +335,14 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
return ERR_CAST(l_ctx);
}
req->wb_lock_context = l_ctx;
- nfs_iocounter_inc(&l_ctx->io_count);
+ atomic_inc(&l_ctx->io_count);
/* Initialize the request struct. Initially, we assume a
* long write-back delay. This will be adjusted in
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
req->wb_index = page_file_index(page);
- page_cache_get(page);
+ get_page(page);
req->wb_offset = offset;
req->wb_pgbase = offset;
req->wb_bytes = count;
@@ -427,11 +392,12 @@ static void nfs_clear_request(struct nfs_page *req)
struct nfs_lock_context *l_ctx = req->wb_lock_context;
if (page != NULL) {
- page_cache_release(page);
+ put_page(page);
req->wb_page = NULL;
}
if (l_ctx != NULL) {
- nfs_iocounter_dec(&l_ctx->io_count);
+ if (atomic_dec_and_test(&l_ctx->io_count))
+ wake_up_atomic_t(&l_ctx->io_count);
nfs_put_lock_context(l_ctx);
req->wb_lock_context = NULL;
}
@@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
* @desc: IO descriptor
* @hdr: pageio header
*/
-static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
{
- struct nfs_pgio_mirror *mirror;
- u32 midx;
-
set_bit(NFS_IOHDR_REDO, &hdr->flags);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
- /* TODO: Make sure it's right to clean up all mirrors here
- * and not just hdr->pgio_mirror_idx */
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- mirror = &desc->pg_mirrors[midx];
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- }
- return -ENOMEM;
}
/**
@@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
unsigned int pagecount, pageused;
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
- if (!nfs_pgarray_set(&hdr->page_array, pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
+ }
nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
pages = hdr->page_array.pagevec;
@@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
*pages++ = last_page = req->wb_page;
}
}
- if (WARN_ON_ONCE(pageused != pagecount))
- return nfs_pgio_error(desc, hdr);
+ if (WARN_ON_ONCE(pageused != pagecount)) {
+ nfs_pgio_error(hdr);
+ desc->pg_error = -EINVAL;
+ return desc->pg_error;
+ }
if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
(desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
@@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror;
struct nfs_pgio_header *hdr;
int ret;
- mirror = nfs_pgio_current_mirror(desc);
-
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- /* TODO: make sure this is right with mirroring - or
- * should it back out all mirrors? */
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
@@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+ if (pgio->pg_error < 0)
+ return pgio->pg_error;
+
if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
return -EINVAL;
@@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
pgio->pg_mirrors_dynamic = NULL;
}
-static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
- const struct nfs_open_context *ctx2)
-{
- return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
-}
-
static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
const struct nfs_lock_context *l2)
{
@@ -951,7 +904,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
return false;
} else {
if (req->wb_pgbase != 0 ||
- prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+ prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
return false;
}
}
@@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
+ if (desc->pg_error < 0)
+ return 0;
mirror->pg_base = req->wb_pgbase;
}
if (!nfs_can_coalesce_requests(prev, req, desc))
@@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
bytes = req->wb_bytes;
nfs_pageio_setup_mirroring(desc, req);
+ if (desc->pg_error < 0)
+ goto out_failed;
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
if (midx) {
@@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (IS_ERR(dupreq)) {
nfs_page_group_unlock(req);
- return 0;
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
}
nfs_lock_request(dupreq);
@@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (nfs_pgio_has_mirroring(desc))
desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
- return 0;
+ goto out_failed;
}
return 1;
+
+out_failed:
+ /*
+ * We might have failed before sending any reqs over wire.
+ * Clean up rest of the reqs in mirror pg_list.
+ */
+ if (desc->pg_error) {
+ struct nfs_pgio_mirror *mirror;
+ void (*func)(struct list_head *);
+
+ /* remember fatal errors */
+ if (nfs_error_is_fatal(desc->pg_error))
+ mapping_set_error(desc->pg_inode->i_mapping,
+ desc->pg_error);
+
+ func = desc->pg_completion_ops->error_cleanup;
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ func(&mirror->pg_list);
+ }
+ }
+ return 0;
}
/*
@@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
nfs_pageio_complete(desc);
if (!list_empty(&failed)) {
list_move(&failed, &hdr->pages);
- return -EIO;
+ return desc->pg_error < 0 ? desc->pg_error : -EIO;
}
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8abe27165ad0..89a5ef4df08a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
*/
static LIST_HEAD(pnfs_modules_tbl);
-static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
- enum pnfs_iomode iomode, bool sync);
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
@@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct inode *inode = lo->plh_inode;
+ pnfs_layoutreturn_before_put_layout_hdr(lo);
+
if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
if (!list_empty(&lo->plh_segs))
WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
@@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+/*
+ * Mark a pnfs_layout_hdr and all associated layout segments as invalid
+ *
+ * In order to continue using the pnfs_layout_hdr, a full recovery
+ * is required.
+ * Note that caller must hold inode->i_lock.
+ */
+static int
+pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list)
+{
+ struct pnfs_layout_range range = {
+ .iomode = IOMODE_ANY,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+}
+
static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
@@ -345,58 +366,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}
-/* Return true if layoutreturn is needed */
-static bool
-pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_segment *lseg)
-{
- struct pnfs_layout_segment *s;
-
- if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
- return false;
-
- list_for_each_entry(s, &lo->plh_segs, pls_list)
- if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
- return false;
-
- return true;
-}
-
-static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
-{
- if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- return false;
- lo->plh_return_iomode = 0;
- pnfs_get_layout_hdr(lo);
- clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
- return true;
-}
-
-static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
- struct pnfs_layout_hdr *lo, struct inode *inode)
-{
- lo = lseg->pls_layout;
- inode = lo->plh_inode;
-
- spin_lock(&inode->i_lock);
- if (pnfs_layout_need_return(lo, lseg)) {
- nfs4_stateid stateid;
- enum pnfs_iomode iomode;
- bool send;
-
- stateid = lo->plh_stateid;
- iomode = lo->plh_return_iomode;
- send = pnfs_prepare_layoutreturn(lo);
- spin_unlock(&inode->i_lock);
- if (send) {
- /* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, stateid, iomode, false);
- }
- } else
- spin_unlock(&inode->i_lock);
-}
-
void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
@@ -410,15 +379,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- /* Handle the case where refcount != 1 */
- if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
- return;
-
lo = lseg->pls_layout;
inode = lo->plh_inode;
- /* Do we need a layoutreturn? */
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
- pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
@@ -566,10 +528,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range)
+ const struct pnfs_layout_range *recall_range)
{
struct pnfs_layout_segment *lseg, *next;
- int invalid = 0, removed = 0;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
@@ -582,11 +544,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
lseg->pls_range.length);
- invalid++;
- removed += mark_lseg_invalid(lseg, tmp_list);
+ if (!mark_lseg_invalid(lseg, tmp_list))
+ remaining++;
}
- dprintk("%s:Return %i\n", __func__, invalid - removed);
- return invalid - removed;
+ dprintk("%s:Return %i\n", __func__, remaining);
+ return remaining;
}
/* note free_me must contain lsegs from a single layout_hdr */
@@ -613,12 +575,10 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
spin_lock(&nfsi->vfs_inode.i_lock);
lo = nfsi->layout;
if (lo) {
- lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
pnfs_get_layout_hdr(lo);
+ pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
- pnfs_clear_retry_layoutget(lo);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -677,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
{
struct pnfs_layout_hdr *lo;
struct inode *inode;
- struct pnfs_layout_range range = {
- .iomode = IOMODE_ANY,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
LIST_HEAD(lseg_list);
int ret = 0;
@@ -696,13 +651,15 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
spin_lock(&inode->i_lock);
list_del_init(&lo->plh_bulk_destroy);
- lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
- if (is_bulk_recall)
- set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
- if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+ if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+ if (is_bulk_recall)
+ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
ret = -EAGAIN;
+ }
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&lseg_list);
+ /* Free all lsegs that are attached to commit buckets */
+ nfs_commit_inode(inode, 0);
pnfs_put_layout_hdr(lo);
iput(inode);
}
@@ -826,7 +783,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state)
{
int status = 0;
@@ -861,7 +818,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_open_context *ctx,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
gfp_t gfp_flags)
{
struct inode *ino = lo->plh_inode;
@@ -872,44 +829,41 @@ send_layoutget(struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
- lgp = kzalloc(sizeof(*lgp), gfp_flags);
- if (lgp == NULL)
- return NULL;
-
- i_size = i_size_read(ino);
-
- lgp->args.minlength = PAGE_CACHE_SIZE;
- if (lgp->args.minlength > range->length)
- lgp->args.minlength = range->length;
- if (range->iomode == IOMODE_READ) {
- if (range->offset >= i_size)
- lgp->args.minlength = 0;
- else if (i_size - range->offset < lgp->args.minlength)
- lgp->args.minlength = i_size - range->offset;
- }
- lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
- lgp->args.range = *range;
- lgp->args.type = server->pnfs_curr_ld->id;
- lgp->args.inode = ino;
- lgp->args.ctx = get_nfs_open_context(ctx);
- lgp->gfp_flags = gfp_flags;
- lgp->cred = lo->plh_lc_cred;
-
- /* Synchronously retrieve layout information from server and
- * store in lseg.
+ /*
+ * Synchronously retrieve layout information from server and
+ * store in lseg. If we race with a concurrent seqid morphing
+ * op, then re-send the LAYOUTGET.
*/
- lseg = nfs4_proc_layoutget(lgp, gfp_flags);
- if (IS_ERR(lseg)) {
- switch (PTR_ERR(lseg)) {
- case -ENOMEM:
- case -ERESTARTSYS:
- break;
- default:
- /* remember that LAYOUTGET failed and suspend trying */
- pnfs_layout_io_set_failed(lo, range->iomode);
+ do {
+ lgp = kzalloc(sizeof(*lgp), gfp_flags);
+ if (lgp == NULL)
+ return NULL;
+
+ i_size = i_size_read(ino);
+
+ lgp->args.minlength = PAGE_SIZE;
+ if (lgp->args.minlength > range->length)
+ lgp->args.minlength = range->length;
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
}
- return NULL;
- } else
+ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+ pnfs_copy_range(&lgp->args.range, range);
+ lgp->args.type = server->pnfs_curr_ld->id;
+ lgp->args.inode = ino;
+ lgp->args.ctx = get_nfs_open_context(ctx);
+ lgp->gfp_flags = gfp_flags;
+ lgp->cred = lo->plh_lc_cred;
+
+ lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+ } while (lseg == ERR_PTR(-EAGAIN));
+
+ if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
+ lseg = NULL;
+ else
pnfs_layout_clear_fail_bit(lo,
pnfs_iomode_to_fail_bit(range->iomode));
@@ -939,8 +893,19 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+ if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ return false;
+ lo->plh_return_iomode = 0;
+ pnfs_get_layout_hdr(lo);
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ return true;
+}
+
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
enum pnfs_iomode iomode, bool sync)
{
struct inode *ino = lo->plh_inode;
@@ -957,7 +922,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
goto out;
}
- lrp->args.stateid = stateid;
+ nfs4_stateid_copy(&lrp->args.stateid, stateid);
lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
lrp->args.inode = ino;
lrp->args.range.iomode = iomode;
@@ -973,6 +938,48 @@ out:
return status;
}
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layout_segment *s;
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return false;
+
+ /* Defer layoutreturn until all lsegs are done */
+ list_for_each_entry(s, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+ return false;
+ }
+
+ return true;
+}
+
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode= lo->plh_inode;
+
+ if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ return;
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_need_return(lo)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+ bool send;
+
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ iomode = lo->plh_return_iomode;
+ send = pnfs_prepare_layoutreturn(lo);
+ spin_unlock(&inode->i_lock);
+ if (send) {
+ /* Send an async layoutreturn so we dont deadlock */
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ }
+ } else
+ spin_unlock(&inode->i_lock);
+}
+
/*
* Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
* when the layout segment list is empty.
@@ -1000,7 +1007,7 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__);
goto out;
}
- stateid = nfsi->layout->plh_stateid;
+ nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
@@ -1028,7 +1035,7 @@ _pnfs_return_layout(struct inode *ino)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
if (send)
- status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
pnfs_put_layout_hdr(lo);
out:
@@ -1091,13 +1098,12 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- stateid = lo->plh_stateid;
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
&lo->plh_flags))
layoutreturn = pnfs_prepare_layoutreturn(lo);
- pnfs_clear_retry_layoutget(lo);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */
if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
@@ -1119,7 +1125,7 @@ out_noroc:
pnfs_free_lseg_list(&tmp_list);
pnfs_layoutcommit_inode(ino, true);
if (layoutreturn)
- pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+ pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
return roc;
}
@@ -1144,6 +1150,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
+ pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
@@ -1460,25 +1467,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
return ret;
}
-/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
-static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
-{
- if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
- return 1;
- return nfs_wait_bit_killable(key);
-}
-
static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
- if (!pnfs_should_retry_layoutget(lo))
- return false;
/*
* send layoutcommit as it can hold up layoutreturn due to lseg
* reference
*/
pnfs_layoutcommit_inode(lo->plh_inode, false);
return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
- pnfs_layoutget_retry_bit_wait,
+ nfs_wait_bit_killable,
TASK_UNINTERRUPTIBLE);
}
@@ -1515,14 +1512,23 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
bool first;
- if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+ if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NO_PNFS);
goto out;
+ }
- if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+ if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
goto out;
+ }
- if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+ if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_MDSTHRESH);
goto out;
+ }
lookup_again:
first = false;
@@ -1530,19 +1536,25 @@ lookup_again:
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
+ trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+ PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
}
/* Do we even need to bother with this? */
if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BULK_RECALL);
dprintk("%s matches recall, use MDS\n", __func__);
goto out_unlock;
}
/* if LAYOUTGET already failed once we don't try again */
- if (pnfs_layout_io_test_failed(lo, iomode) &&
- !pnfs_should_retry_layoutget(lo))
+ if (pnfs_layout_io_test_failed(lo, iomode)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
goto out_unlock;
+ }
first = list_empty(&lo->plh_segs);
if (first) {
@@ -1562,8 +1574,11 @@ lookup_again:
* already exists
*/
lseg = pnfs_find_lseg(lo, &arg);
- if (lseg)
+ if (lseg) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_FOUND_CACHED);
goto out_unlock;
+ }
}
/*
@@ -1580,11 +1595,16 @@ lookup_again:
dprintk("%s retrying\n", __func__);
goto lookup_again;
}
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_RETURN);
goto out_put_layout_hdr;
}
- if (pnfs_layoutgets_blocked(lo))
+ if (pnfs_layoutgets_blocked(lo)) {
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_BLOCKED);
goto out_unlock;
+ }
atomic_inc(&lo->plh_outstanding);
spin_unlock(&ino->i_lock);
@@ -1598,17 +1618,18 @@ lookup_again:
spin_unlock(&clp->cl_lock);
}
- pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+ pg_offset = arg.offset & ~PAGE_MASK;
if (pg_offset) {
arg.offset -= pg_offset;
arg.length += pg_offset;
}
if (arg.length != NFS4_MAX_UINT64)
- arg.length = PAGE_CACHE_ALIGN(arg.length);
+ arg.length = PAGE_ALIGN(arg.length);
lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
- pnfs_clear_retry_layoutget(lo);
atomic_dec(&lo->plh_outstanding);
+ trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
@@ -1618,7 +1639,7 @@ out:
"(%s, offset: %llu, length: %llu)\n",
__func__, ino->i_sb->s_id,
(unsigned long long)NFS_FILEID(ino),
- lseg == NULL ? "not found" : "found",
+ IS_ERR_OR_NULL(lseg) ? "not found" : "found",
iomode==IOMODE_RW ? "read/write" : "read-only",
(unsigned long long)pos,
(unsigned long long)count);
@@ -1687,6 +1708,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
+ status = -EAGAIN;
goto out_forget_reply;
}
pnfs_set_layout_stateid(lo, &res->stateid, false);
@@ -1724,16 +1746,40 @@ out_forget_reply:
}
static void
+pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+{
+ if (lo->plh_return_iomode == iomode)
+ return;
+ if (lo->plh_return_iomode != 0)
+ iomode = IOMODE_ANY;
+ lo->plh_return_iomode = iomode;
+ set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
+/**
+ * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
+ * @lo: pointer to layout header
+ * @tmp_list: list header to be used with pnfs_free_lseg_list()
+ * @return_range: describe layout segment ranges to be returned
+ *
+ * This function is mainly intended for use by layoutrecall. It attempts
+ * to free the layout segment immediately, or else to mark it for return
+ * as soon as its reference count drops to zero.
+ */
+int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *return_range)
+ const struct pnfs_layout_range *return_range)
{
struct pnfs_layout_segment *lseg, *next;
+ int remaining = 0;
dprintk("%s:Begin lo %p\n", __func__, lo);
if (list_empty(&lo->plh_segs))
- return;
+ return 0;
+
+ assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
if (should_free_lseg(&lseg->pls_range, return_range)) {
@@ -1742,39 +1788,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg, lseg->pls_range.iomode,
lseg->pls_range.offset,
lseg->pls_range.length);
+ if (mark_lseg_invalid(lseg, tmp_list))
+ continue;
+ remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
- mark_lseg_invalid(lseg, tmp_list);
- set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags);
+ pnfs_set_plh_return_iomode(lo, return_range->iomode);
}
+ return remaining;
}
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg)
{
struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
struct pnfs_layout_range range = {
.iomode = lseg->pls_range.iomode,
.offset = 0,
.length = NFS4_MAX_UINT64,
};
LIST_HEAD(free_me);
+ bool return_now = false;
spin_lock(&inode->i_lock);
- /* set failure bit so that pnfs path will be retried later */
- pnfs_layout_set_fail_bit(lo, iomode);
- if (lo->plh_return_iomode == 0)
- lo->plh_return_iomode = range.iomode;
- else if (lo->plh_return_iomode != range.iomode)
- lo->plh_return_iomode = IOMODE_ANY;
+ pnfs_set_plh_return_iomode(lo, range.iomode);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
- spin_unlock(&inode->i_lock);
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode = lo->plh_return_iomode;
+
+ nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+ return_now = pnfs_prepare_layoutreturn(lo);
+ spin_unlock(&inode->i_lock);
+ if (return_now)
+ pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ } else {
+ spin_unlock(&inode->i_lock);
+ nfs_commit_inode(inode, 0);
+ }
pnfs_free_lseg_list(&free_me);
}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -1796,6 +1850,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
rd_size,
IOMODE_READ,
GFP_KERNEL);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
}
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
@@ -1808,13 +1867,19 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- if (pgio->pg_lseg == NULL)
+ if (pgio->pg_lseg == NULL) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
wb_size,
IOMODE_RW,
GFP_NOFS);
+ if (IS_ERR(pgio->pg_lseg)) {
+ pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ return;
+ }
+ }
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_write_mds(pgio);
@@ -1912,12 +1977,13 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
- if (!hdr->pnfs_error) {
+ if (likely(!hdr->pnfs_error)) {
pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
hdr->mds_offset + hdr->res.count);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_write_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -1981,15 +2047,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
@@ -2028,11 +2092,12 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
__nfs4_read_done_cb(hdr);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_read_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -2111,15 +2176,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
- return -ENOMEM;
+ desc->pg_error = -ENOMEM;
+ return desc->pg_error;
}
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d1990e90e7a0..1ac1db5f6dad 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,11 +94,10 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
- NFS_LAYOUT_RETURN, /* Return this layout ASAP */
- NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
+ NFS_LAYOUT_RETURN, /* layoutreturn in progress */
+ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
- NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
};
enum layoutdriver_policy_flags {
@@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool update_barrier);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range,
+ const struct pnfs_layout_range *range,
struct nfs4_state *open_state);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
- struct pnfs_layout_range *recall_range);
+ const struct pnfs_layout_range *recall_range);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ const struct pnfs_layout_range *recall_range);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
return d;
}
-static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
- atomic_inc(&lo->plh_refcount);
-}
-
-static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
- atomic_dec(&lo->plh_refcount);
- /* wake up waiters for LAYOUTRETURN as that is not needed */
- wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
- }
-}
-
-static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
- return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
-}
-
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
return lseg;
}
+/* Report whether NFS_LSEG_VALID is still set in @lseg->pls_flags. */
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+	return !!test_bit(NFS_LSEG_VALID, &lseg->pls_flags);
+}
+
/* Return true if a layout driver is being used for this mountpoint */
static inline int pnfs_enabled_sb(struct nfs_server *nfss)
{
@@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end)
return 1 + end - offset;
}
+/**
+ * pnfs_mark_layout_returned_if_empty - marks the layout as returned
+ * @lo: layout header
+ *
+ * Sets NFS_LAYOUT_INVALID_STID in @lo->plh_flags when @lo->plh_segs holds
+ * no segments; a layout that still has segments is left alone.
+ *
+ * Note: Caller must hold inode->i_lock
+ */
+static inline void
+pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
+{
+	if (!list_empty(&lo->plh_segs))
+		return;
+
+	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+}
+
+/* Copy the layout range @src into @dst, byte for byte. */
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+		const struct pnfs_layout_range *src)
+{
+	memcpy(dst, src, sizeof(struct pnfs_layout_range));
+}
+
extern unsigned int layoutstats_timer;
#ifdef NFS_DEBUG
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 24655b807d44..4aaed890048f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
} else {
nfs_retry_commit(mds_pages, NULL, cinfo, 0);
pnfs_generic_retry_commit(cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
}
nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
- if (nreq == 0) {
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ if (nreq == 0)
goto out;
- }
atomic_add(nreq, &cinfo->mds->rpcs_out);
@@ -609,12 +606,22 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- clp = get_v3_ds_connect(mds_srv->nfs_client,
+ if (!IS_ERR(clp)) {
+ struct xprt_create xprt_args = {
+ .ident = XPRT_TRANSPORT_TCP,
+ .net = clp->cl_net,
+ .dstaddr = (struct sockaddr *)&da->da_addr,
+ .addrlen = da->da_addrlen,
+ .servername = clp->cl_hostname,
+ };
+ /* Add this address as an alias */
+ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+ rpc_clnt_test_and_add_xprt, NULL);
+ } else
+ clp = get_v3_ds_connect(mds_srv->nfs_client,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, au_flavor);
- if (!IS_ERR(clp))
- break;
}
if (IS_ERR(clp)) {
@@ -871,6 +878,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
buckets = cinfo->ds->buckets;
list = &buckets[ds_commit_idx].written;
if (list_empty(list)) {
+ if (!pnfs_is_valid_lseg(lseg)) {
+ spin_unlock(cinfo->lock);
+ cinfo->completion_ops->resched_write(cinfo, req);
+ return;
+ }
/* Non-empty buckets hold a reference on the lseg. That ref
* is normally transferred to the COMMIT call and released
* there. It could also be released if the last req is pulled
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 01b8cc8e8cfc..6776d7a7839e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -46,7 +46,7 @@ static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
static
int nfs_return_empty_page(struct page *page)
{
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
SetPageUptodate(page);
unlock_page(page);
return 0;
@@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+/*
+ * Finish with a read request. When nfs_page_group_sync_on_bit() reports
+ * true for PG_UNLOCKPAGE the page is unlocked, after being handed to
+ * fscache if it is up to date. The request reference is dropped in
+ * every case.
+ */
+static void nfs_readpage_release(struct nfs_page *req)
+{
+	struct page *page = req->wb_page;
+	struct inode *inode = d_inode(req->wb_context->dentry);
+
+	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+		(long long)req_offset(req));
+
+	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+		if (PageUptodate(page))
+			nfs_readpage_to_fscache(inode, page, 0);
+
+		unlock_page(page);
+	}
+
+	nfs_release_request(req);
+}
+
int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct page *page)
{
@@ -101,12 +118,15 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
unlock_page(page);
return PTR_ERR(new);
}
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
nfs_pageio_init_read(&pgio, inode, false,
&nfs_async_read_completion_ops);
- nfs_pageio_add_request(&pgio, new);
+ if (!nfs_pageio_add_request(&pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
+ }
nfs_pageio_complete(&pgio);
/* It doesn't make sense to do mirrored reads! */
@@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
- return 0;
-}
-
-static void nfs_readpage_release(struct nfs_page *req)
-{
- struct inode *inode = d_inode(req->wb_context->dentry);
-
- dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
- (unsigned long long)NFS_FILEID(inode), req->wb_bytes,
- (long long)req_offset(req));
-
- if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(inode, req->wb_page, 0);
-
- unlock_page(req->wb_page);
- }
- nfs_release_request(req);
+ return pgio.pg_error < 0 ? pgio.pg_error : 0;
}
static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -246,6 +249,13 @@ static void nfs_readpage_retry(struct rpc_task *task,
nfs_set_pgio_error(hdr, -EIO, argp->offset);
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Yes, so retry the read at the end of the hdr */
hdr->mds_offset += resp->count;
argp->offset += resp->count;
@@ -268,7 +278,7 @@ static void nfs_readpage_result(struct rpc_task *task,
hdr->good_bytes = bound - hdr->io_start;
}
spin_unlock(&hdr->lock);
- } else if (hdr->res.count != hdr->args.count)
+ } else if (hdr->res.count < hdr->args.count)
nfs_readpage_retry(task, hdr);
}
@@ -285,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page)
int error;
dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
- page, PAGE_CACHE_SIZE, page_file_index(page));
+ page, PAGE_SIZE, page_file_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
@@ -351,9 +361,11 @@ readpage_async_filler(void *data, struct page *page)
if (IS_ERR(new))
goto out_error;
- if (len < PAGE_CACHE_SIZE)
- zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ if (len < PAGE_SIZE)
+ zero_user_segment(page, len, PAGE_SIZE);
if (!nfs_pageio_add_request(desc->pgio, new)) {
+ nfs_list_remove_request(new);
+ nfs_readpage_release(new);
error = desc->pgio->pg_error;
goto out_unlock;
}
@@ -412,8 +424,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
pgm = &pgio.pg_mirrors[0];
NFS_I(inode)->read_io += pgm->pg_bytes_written;
- npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
nfs_add_stats(inode, NFSIOS_READPAGES, npages);
read_complete:
put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 383a027de452..f1268280244e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2816,7 +2816,6 @@ out_invalid_transport_udp:
* NFS client for backwards compatibility
*/
unsigned int nfs_callback_set_tcpport;
-unsigned short nfs_callback_tcpport;
/* Default cache timeout is 10 minutes */
unsigned int nfs_idmap_cache_timeout = 600;
/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
@@ -2827,7 +2826,6 @@ char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
-EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
EXPORT_SYMBOL_GPL(max_session_slots);
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index b6de433da5db..4fe3eead3868 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -42,21 +42,35 @@ error:
return -EIO;
}
-static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *nfs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct page *page;
void *err;
- err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
- if (err)
- return err;
- page = read_cache_page(&inode->i_data, 0,
- (filler_t *)nfs_symlink_filler, inode);
- if (IS_ERR(page))
- return ERR_CAST(page);
- *cookie = page;
- return kmap(page);
+ if (!dentry) {
+ err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
+ if (err)
+ return err;
+ page = find_get_page(inode->i_mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
+ if (err)
+ return err;
+ page = read_cache_page(&inode->i_data, 0,
+ (filler_t *)nfs_symlink_filler, inode);
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ }
+ set_delayed_call(done, page_put_link, page);
+ return page_address(page);
}
/*
@@ -64,8 +78,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
*/
const struct inode_operations nfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = nfs_follow_link,
- .put_link = page_put_link,
+ .get_link = nfs_get_link,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
};
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75ab7622e0cc..5f4fd53e5764 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -21,6 +21,8 @@
#include <linux/nfs_page.h>
#include <linux/backing-dev.h>
#include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
#include <asm/uaccess.h>
@@ -148,7 +150,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
spin_lock(&inode->i_lock);
i_size = i_size_read(inode);
- end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (i_size - 1) >> PAGE_SHIFT;
if (i_size > 0 && page_file_index(page) < end_index)
goto out;
end = page_file_offset(page) + ((loff_t)offset+count);
@@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc)
{
int ret = 0;
if (wbc->for_reclaim)
- return FLUSH_HIGHPRI | FLUSH_STABLE;
+ return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
if (wbc->sync_mode == WB_SYNC_ALL)
ret = FLUSH_COND_STABLE;
- if (wbc->for_kupdate || wbc->for_background)
- ret |= FLUSH_LOWPRI;
return ret;
}
@@ -545,12 +545,22 @@ try_again:
return head;
}
+/*
+ * Give up on a request that hit a fatal write error: unlock the request,
+ * end writeback on its page and remove the page from its mapping.
+ *
+ * The request reference is dropped last. generic_error_remove_page()
+ * still reads req->wb_page, so releasing the request first could touch
+ * @req after it has been freed.
+ */
+static void nfs_write_error_remove_page(struct nfs_page *req)
+{
+	nfs_unlock_request(req);
+	nfs_end_page_writeback(req);
+	generic_error_remove_page(page_file_mapping(req->wb_page),
+				  req->wb_page);
+	nfs_release_request(req);
+}
+
/*
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock)
+ struct page *page, bool nonblock,
+ bool launder)
{
struct nfs_page *req;
int ret = 0;
@@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
- nfs_redirty_request(req);
ret = pgio->pg_error;
+ /*
+ * Remove the problematic req upon fatal errors
+ * in launder case, while other dirty pages can
+ * still be around until they get flushed.
+ */
+ if (nfs_error_is_fatal(ret)) {
+ nfs_context_set_write_error(req->wb_context, ret);
+ if (launder) {
+ nfs_write_error_remove_page(req);
+ goto out;
+ }
+ }
+ nfs_redirty_request(req);
+ ret = -EAGAIN;
} else
nfs_add_stats(page_file_mapping(page)->host,
NFSIOS_WRITEPAGES, 1);
@@ -576,12 +599,14 @@ out:
return ret;
}
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio, bool launder)
{
int ret;
nfs_pageio_cond_complete(pgio, page_file_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+ ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
+ launder);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
/*
* Write an mmapped page to the server.
*/
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_writepage_locked(struct page *page,
+ struct writeback_control *wbc,
+ bool launder)
{
struct nfs_pageio_descriptor pgio;
struct inode *inode = page_file_mapping(page)->host;
@@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
false, &nfs_async_write_completion_ops);
- err = nfs_do_writepage(page, wbc, &pgio);
+ err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
if (err < 0)
return err;
@@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
{
int ret;
- ret = nfs_writepage_locked(page, wbc);
+ ret = nfs_writepage_locked(page, wbc, false);
unlock_page(page);
return ret;
}
@@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
{
int ret;
- ret = nfs_do_writepage(page, wbc, data);
+ ret = nfs_do_writepage(page, wbc, data, false);
unlock_page(page);
return ret;
}
@@ -803,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
* holding the nfs_page lock.
*/
void
-nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
- struct nfs_commit_info *cinfo)
+nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
spin_lock(cinfo->lock);
- nfs_request_add_commit_list_locked(req, dst, cinfo);
+ nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
spin_unlock(cinfo->lock);
nfs_mark_page_unstable(req->wb_page, cinfo);
}
@@ -865,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
{
if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
return;
- nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+ nfs_request_add_commit_list(req, cinfo);
}
static void
@@ -1128,7 +1154,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
if (req == NULL)
return 0;
l_ctx = req->wb_lock_context;
- do_flush = req->wb_page != page || req->wb_context != ctx;
+ do_flush = req->wb_page != page ||
+ !nfs_match_open_context(req->wb_context, ctx);
/* for now, flush if more than 1 request in page_group */
do_flush |= req->wb_this_page != req;
if (l_ctx && flctx &&
@@ -1326,9 +1353,15 @@ static void nfs_async_write_error(struct list_head *head)
}
}
+/* ->reschedule_io hook: pass the header's page list to nfs_async_write_error(). */
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+	struct list_head *pages = &hdr->pages;
+
+	nfs_async_write_error(pages);
+}
+
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
.error_cleanup = nfs_async_write_error,
.completion = nfs_write_completion,
+ .reschedule_io = nfs_async_write_reschedule_io,
};
void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -1505,6 +1538,13 @@ static void nfs_writeback_result(struct rpc_task *task,
task->tk_status = -EIO;
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
@@ -1522,27 +1562,21 @@ static void nfs_writeback_result(struct rpc_task *task,
}
}
-
-static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
{
- int ret;
+ return wait_on_atomic_t(&cinfo->rpcs_out,
+ nfs_wait_atomic_killable, TASK_KILLABLE);
+}
- if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
- return 1;
- if (!may_wait)
- return 0;
- ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- return (ret < 0) ? ret : 1;
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+ atomic_inc(&cinfo->rpcs_out);
}
-static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
{
- clear_bit(NFS_INO_COMMIT, &nfsi->flags);
- smp_mb__after_atomic();
- wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+ if (atomic_dec_and_test(&cinfo->rpcs_out))
+ wake_up_atomic_t(&cinfo->rpcs_out);
}
void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1659,6 +1693,13 @@ void nfs_retry_commit(struct list_head *page_list,
}
EXPORT_SYMBOL_GPL(nfs_retry_commit);
+/* ->resched_write hook: mark the request's page dirty again; @cinfo is unused. */
+static void
+nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+			 struct nfs_page *req)
+{
+	struct page *page = req->wb_page;
+
+	__set_page_dirty_nobuffers(page);
+}
+
/*
* Commit dirty pages
*/
@@ -1680,7 +1721,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
data->mds_ops, how, 0);
out_bad:
nfs_retry_commit(head, NULL, cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
@@ -1742,8 +1782,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
- if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
- nfs_commit_clear_lock(NFS_I(data->inode));
+ nfs_commit_end(cinfo.mds);
}
static void nfs_commit_release(void *calldata)
@@ -1762,7 +1801,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
.completion = nfs_commit_release_pages,
- .error_cleanup = nfs_commit_clear_lock,
+ .resched_write = nfs_commit_resched_write,
};
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
@@ -1781,30 +1820,25 @@ int nfs_commit_inode(struct inode *inode, int how)
LIST_HEAD(head);
struct nfs_commit_info cinfo;
int may_wait = how & FLUSH_SYNC;
+ int error = 0;
int res;
- res = nfs_commit_set_lock(NFS_I(inode), may_wait);
- if (res <= 0)
- goto out_mark_dirty;
nfs_init_cinfo_from_inode(&cinfo, inode);
+ nfs_commit_begin(cinfo.mds);
res = nfs_scan_commit(inode, &head, &cinfo);
- if (res) {
- int error;
-
+ if (res)
error = nfs_generic_commit_list(inode, &head, how, &cinfo);
- if (error < 0)
- return error;
- if (!may_wait)
- goto out_mark_dirty;
- error = wait_on_bit_action(&NFS_I(inode)->flags,
- NFS_INO_COMMIT,
- nfs_wait_bit_killable,
- TASK_KILLABLE);
- if (error < 0)
- return error;
- } else
- nfs_commit_clear_lock(NFS_I(inode));
+ nfs_commit_end(cinfo.mds);
+ if (error < 0)
+ goto out_error;
+ if (!may_wait)
+ goto out_mark_dirty;
+ error = wait_on_commit(cinfo.mds);
+ if (error < 0)
+ return error;
return res;
+out_error:
+ res = error;
/* Note: If we exit without ensuring that the commit is complete,
* we must mark the inode as dirty. Otherwise, future calls to
* sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
@@ -1814,6 +1848,7 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return res;
}
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
@@ -1904,10 +1939,10 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/*
* Write back all requests on one page - we do this before reading it.
*/
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
{
loff_t range_start = page_file_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+ loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
@@ -1921,7 +1956,7 @@ int nfs_wb_page(struct inode *inode, struct page *page)
for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- ret = nfs_writepage_locked(page, &wbc);
+ ret = nfs_writepage_locked(page, &wbc, launder);
if (ret < 0)
goto out_error;
continue;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index a0b77fc1bd39..c9f583d7bac8 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -84,12 +84,30 @@ config NFSD_V4
If unsure, say N.
config NFSD_PNFS
- bool "NFSv4.1 server support for Parallel NFS (pNFS)"
- depends on NFSD_V4
+ bool
+
+config NFSD_BLOCKLAYOUT
+ bool "NFSv4.1 server support for pNFS block layouts"
+ depends on NFSD_V4 && BLOCK
+ select NFSD_PNFS
+ help
+ This option enables support for exporting pNFS block layouts
+ in the kernel's NFS server. The pNFS block layout enables NFS
+ clients to directly perform I/O to block devices accessible to both
+ the server and the clients. See RFC 5663 for more details.
+
+ If unsure, say N.
+
+config NFSD_SCSILAYOUT
+ bool "NFSv4.1 server support for pNFS SCSI layouts"
+ depends on NFSD_V4 && BLOCK
+ select NFSD_PNFS
help
- This option enables support for the parallel NFS features of the
- minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
- server.
+ This option enables support for exporting pNFS SCSI layouts
+ in the kernel's NFS server. The pNFS SCSI layout enables NFS
+ clients to directly perform I/O to SCSI devices accessible to both
+ the server and the clients. See draft-ietf-nfsv4-scsi-layout for
+ more details.
If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9a6028e120c6..3ae5f3c77e28 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -17,4 +17,6 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
-nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
+nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index c29d9421bd5e..e55b5242614d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -1,11 +1,14 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/exportfs.h>
#include <linux/genhd.h>
#include <linux/slab.h>
+#include <linux/pr.h>
#include <linux/nfsd/debug.h>
+#include <scsi/scsi_proto.h>
+#include <scsi/scsi_common.h>
#include "blocklayoutxdr.h"
#include "pnfs.h"
@@ -13,37 +16,6 @@
#define NFSDDBG_FACILITY NFSDDBG_PNFS
-static int
-nfsd4_block_get_device_info_simple(struct super_block *sb,
- struct nfsd4_getdeviceinfo *gdp)
-{
- struct pnfs_block_deviceaddr *dev;
- struct pnfs_block_volume *b;
-
- dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
- sizeof(struct pnfs_block_volume), GFP_KERNEL);
- if (!dev)
- return -ENOMEM;
- gdp->gd_device = dev;
-
- dev->nr_volumes = 1;
- b = &dev->volumes[0];
-
- b->type = PNFS_BLOCK_VOLUME_SIMPLE;
- b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
- return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
- &b->simple.offset);
-}
-
-static __be32
-nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
- struct nfsd4_getdeviceinfo *gdp)
-{
- if (sb->s_bdev != sb->s_bdev->bd_contains)
- return nfserr_inval;
- return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
-}
-
static __be32
nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
struct nfsd4_layoutget *args)
@@ -141,20 +113,13 @@ out_layoutunavailable:
}
static __be32
-nfsd4_block_proc_layoutcommit(struct inode *inode,
- struct nfsd4_layoutcommit *lcp)
+nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp,
+ struct iomap *iomaps, int nr_iomaps)
{
loff_t new_size = lcp->lc_last_wr + 1;
struct iattr iattr = { .ia_valid = 0 };
- struct iomap *iomaps;
- int nr_iomaps;
int error;
- nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
- if (nr_iomaps < 0)
- return nfserrno(nr_iomaps);
-
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
lcp->lc_mtime = current_fs_time(inode->i_sb);
@@ -172,6 +137,54 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
return nfserrno(error);
}
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
+/*
+ * Build a single-volume PNFS_BLOCK_VOLUME_SIMPLE device address for @sb
+ * and store it in @gdp->gd_device.
+ *
+ * Returns -ENOMEM if the allocation fails, otherwise whatever the
+ * export's ->get_uuid() returns.
+ */
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct pnfs_block_deviceaddr *addr;
+	struct pnfs_block_volume *vol;
+
+	addr = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		       sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	/* Stored before ->get_uuid() runs, so it stays set if that fails. */
+	gdp->gd_device = addr;
+	addr->nr_volumes = 1;
+
+	vol = &addr->volumes[0];
+	vol->type = PNFS_BLOCK_VOLUME_SIMPLE;
+	vol->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+
+	return sb->s_export_op->get_uuid(sb, vol->simple.sig,
+					 &vol->simple.sig_len,
+					 &vol->simple.offset);
+}
+
+/*
+ * GETDEVICEINFO handler for the block layout. Returns nfserr_inval when
+ * sb->s_bdev is not the same device as its bd_contains; otherwise returns
+ * the simple device info result converted by nfserrno(). @clp is unused.
+ */
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct block_device *bdev = sb->s_bdev;
+	int err;
+
+	if (bdev != bdev->bd_contains)
+		return nfserr_inval;
+
+	err = nfsd4_block_get_device_info_simple(sb, gdp);
+	return nfserrno(err);
+}
+
+/*
+ * LAYOUTCOMMIT handler for the block layout: decode the client's layout
+ * update into an iomap array and pass it to nfsd4_block_commit_blocks().
+ * A negative decode result is returned through nfserrno().
+ */
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *extents;
+	int count;
+
+	count = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+						lcp->lc_up_len, &extents,
+						1 << inode->i_blkbits);
+	if (count >= 0)
+		return nfsd4_block_commit_blocks(inode, lcp, extents, count);
+
+	return nfserrno(count);
+}
+
const struct nfsd4_layout_ops bl_layout_ops = {
/*
* Pretend that we send notification to the client. This is a blatant
@@ -190,3 +203,206 @@ const struct nfsd4_layout_ops bl_layout_ops = {
.encode_layoutget = nfsd4_block_encode_layoutget,
.proc_layoutcommit = nfsd4_block_proc_layoutcommit,
};
+#endif /* CONFIG_NFSD_BLOCKLAYOUT */
+
+#ifdef CONFIG_NFSD_SCSILAYOUT
+/*
+ * Send an INQUIRY for page 0x83 to @bdev and copy a suitable designator
+ * into @b->scsi.
+ *
+ * Only designators with LU association, type EUI-64 (0x02) or NAA (0x03)
+ * and a length of 8, 12 or 16 bytes are accepted; a 16 byte one ends the
+ * search.
+ *
+ * Returns 0 on success (including the case where no acceptable designator
+ * was present), -ENOMEM if the buffer or request cannot be allocated,
+ * -EINVAL if the device reports a page longer than the buffer, or the
+ * error from blk_rq_map_kern()/blk_execute_rq().
+ */
+static int nfsd4_scsi_identify_device(struct block_device *bdev,
+		struct pnfs_block_volume *b)
+{
+	struct request_queue *q = bdev->bd_disk->queue;
+	struct request *rq;
+	size_t bufflen = 252, len, id_len;
+	u8 *buf, *d, *end, type, assoc;
+	int error;
+
+	buf = kzalloc(bufflen, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rq = blk_get_request(q, READ, GFP_KERNEL);
+	if (IS_ERR(rq)) {
+		error = -ENOMEM;
+		goto out_free_buf;
+	}
+	blk_rq_set_block_pc(rq);
+
+	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
+	if (error)
+		goto out_put_request;
+
+	rq->cmd[0] = INQUIRY;
+	rq->cmd[1] = 1;		/* EVPD: ask for a vital product data page */
+	rq->cmd[2] = 0x83;	/* page code */
+	rq->cmd[3] = bufflen >> 8;
+	rq->cmd[4] = bufflen & 0xff;
+	rq->cmd_len = COMMAND_SIZE(INQUIRY);
+
+	error = blk_execute_rq(rq->q, NULL, rq, 1);
+	if (error) {
+		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
+			rq->errors);
+		goto out_put_request;
+	}
+
+	len = (buf[2] << 8) + buf[3] + 4;
+	if (len > bufflen) {
+		pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n",
+			len);
+		/* error is still 0 here; do not report this as success */
+		error = -EINVAL;
+		goto out_put_request;
+	}
+
+	/*
+	 * Walk the designation descriptors. Each has a 4 byte header whose
+	 * last byte is the designator length. Never read a header or a
+	 * designator that extends past the length the device reported.
+	 */
+	end = buf + len;
+	for (d = buf + 4; end - d >= 4; d += id_len + 4) {
+		id_len = d[3];
+		if (id_len > (size_t)(end - d) - 4)
+			break;
+
+		type = d[1] & 0xf;
+		assoc = (d[1] >> 4) & 0x3;
+
+		/*
+		 * We only care about EUI-64 and NAA designator types
+		 * with LU association.
+		 */
+		if (assoc != 0x00)
+			continue;
+		if (type != 0x02 && type != 0x03)
+			continue;
+		if (id_len != 8 && id_len != 12 && id_len != 16)
+			continue;
+
+		b->scsi.code_set = PS_CODE_SET_BINARY;
+		b->scsi.designator_type = type == 0x02 ?
+			PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA;
+		b->scsi.designator_len = id_len;
+		memcpy(b->scsi.designator, d + 4, id_len);
+
+		/*
+		 * If we found an 8 or 12 byte descriptor continue on to
+		 * see if a 16 byte one is available. If we find a
+		 * 16 byte descriptor we're done.
+		 */
+		if (id_len == 16)
+			break;
+	}
+
+out_put_request:
+	blk_put_request(rq);
+out_free_buf:
+	kfree(buf);
+	return error;
+}
+
+#define NFSD_MDS_PR_KEY 0x0100000000000000
+
+/*
+ * The reservation key for a client is built from its client ID: cl_boot
+ * goes in the upper 32 bits and cl_id in the lower bits. Having a unique
+ * key per client lets us easily fence a client when recalls fail.
+ */
+static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp)
+{
+	u64 boot = clp->cl_clientid.cl_boot;
+	u64 id = clp->cl_clientid.cl_id;
+
+	return (boot << 32) | id;
+}
+
+/*
+ * Build a single-volume PNFS_BLOCK_VOLUME_SCSI device address for @sb,
+ * store it in @gdp->gd_device, then register NFSD_MDS_PR_KEY with the
+ * device and take a PR_EXCLUSIVE_ACCESS_REG_ONLY reservation with it.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, the error from
+ * nfsd4_scsi_identify_device(), or -EINVAL when the device has no pr_ops
+ * or the register/reserve call fails.
+ */
+static int
+nfsd4_block_get_device_info_scsi(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct block_device *bdev = sb->s_bdev;
+	struct pnfs_block_deviceaddr *addr;
+	struct pnfs_block_volume *vol;
+	const struct pr_ops *pr;
+	int ret;
+
+	addr = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+		       sizeof(struct pnfs_block_volume), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	/* Stored before the later steps, so it stays set on the error returns. */
+	gdp->gd_device = addr;
+	addr->nr_volumes = 1;
+
+	vol = &addr->volumes[0];
+	vol->type = PNFS_BLOCK_VOLUME_SCSI;
+	vol->scsi.pr_key = nfsd4_scsi_pr_key(clp);
+
+	ret = nfsd4_scsi_identify_device(bdev, vol);
+	if (ret)
+		return ret;
+
+	pr = bdev->bd_disk->fops->pr_ops;
+	if (!pr) {
+		pr_err("pNFS: device %s does not support PRs.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	if (pr->pr_register(bdev, 0, NFSD_MDS_PR_KEY, true)) {
+		pr_err("pNFS: failed to register key for device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	if (pr->pr_reserve(bdev, NFSD_MDS_PR_KEY,
+			   PR_EXCLUSIVE_ACCESS_REG_ONLY, 0)) {
+		pr_err("pNFS: failed to reserve device %s.\n",
+			sb->s_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * GETDEVICEINFO handler for the SCSI layout. Returns nfserr_inval when
+ * sb->s_bdev is not the same device as its bd_contains; otherwise returns
+ * the SCSI device info result converted by nfserrno().
+ */
+static __be32
+nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+		struct nfs4_client *clp,
+		struct nfsd4_getdeviceinfo *gdp)
+{
+	struct block_device *bdev = sb->s_bdev;
+	int err;
+
+	if (bdev != bdev->bd_contains)
+		return nfserr_inval;
+
+	err = nfsd4_block_get_device_info_scsi(sb, clp, gdp);
+	return nfserrno(err);
+}
+/*
+ * LAYOUTCOMMIT handler for the SCSI layout: decode the client's layout
+ * update into an iomap array and pass it to nfsd4_block_commit_blocks().
+ * A negative decode result is returned through nfserrno().
+ */
+static __be32
+nfsd4_scsi_proc_layoutcommit(struct inode *inode,
+		struct nfsd4_layoutcommit *lcp)
+{
+	struct iomap *extents;
+	int count;
+
+	count = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
+					       lcp->lc_up_len, &extents,
+					       1 << inode->i_blkbits);
+	if (count >= 0)
+		return nfsd4_block_commit_blocks(inode, lcp, extents, count);
+
+	return nfserrno(count);
+}
+
+/*
+ * Fence the client owning @ls: call pr_preempt() on the block device
+ * behind the layout's file, passing NFSD_MDS_PR_KEY and the client's
+ * reservation key. The result of pr_preempt() is not checked.
+ */
+static void
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+{
+	struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev;
+	const struct pr_ops *pr = bdev->bd_disk->fops->pr_ops;
+	u64 client_key = nfsd4_scsi_pr_key(ls->ls_stid.sc_client);
+
+	pr->pr_preempt(bdev, NFSD_MDS_PR_KEY, client_key, 0, true);
+}
+
+/*
+ * Layout operations for the pNFS SCSI layout. The getdeviceinfo and
+ * layoutget encoders and the layoutget handler are the block layout ones.
+ */
+const struct nfsd4_layout_ops scsi_layout_ops = {
+	/*
+	 * Claim that device ID notifications are sent to the client. This
+	 * is a blatant lie, told to force recent Linux clients to cache our
+	 * device IDs. The device ID rarely ever changes, so the harm of
+	 * leaking deviceids for a while isn't too bad. RFC5661 is
+	 * unfortunately a complete mess in this regard; errata 4119 was
+	 * filed for it a while ago, and hopefully the Linux client will
+	 * eventually start caching deviceids without this again.
+	 */
+	.notify_types		=
+			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+
+	.proc_getdeviceinfo	= nfsd4_scsi_proc_getdeviceinfo,
+	.proc_layoutget		= nfsd4_block_proc_layoutget,
+	.proc_layoutcommit	= nfsd4_scsi_proc_layoutcommit,
+
+	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
+	.encode_layoutget	= nfsd4_block_encode_layoutget,
+
+	.fence_client		= nfsd4_scsi_fence_client,
+};
+#endif /* CONFIG_NFSD_SCSILAYOUT */
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6d834dc9bbc8..6c3b316f932e 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Christoph Hellwig.
+ * Copyright (c) 2014-2016 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/exportfs.h>
@@ -53,6 +53,18 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_encode_hyper(p, b->simple.offset);
p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
break;
+ case PNFS_BLOCK_VOLUME_SCSI:
+ len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
+ p = xdr_reserve_space(xdr, len);
+ if (!p)
+ return -ETOOSMALL;
+
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(b->scsi.code_set);
+ *p++ = cpu_to_be32(b->scsi.designator_type);
+ p = xdr_encode_opaque(p, b->scsi.designator, b->scsi.designator_len);
+ p = xdr_encode_hyper(p, b->scsi.pr_key);
+ break;
default:
return -ENOTSUPP;
}
@@ -93,18 +105,22 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size)
{
struct iomap *iomaps;
- u32 nr_iomaps, expected, i;
+ u32 nr_iomaps, i;
if (len < sizeof(u32)) {
dprintk("%s: extent array too small: %u\n", __func__, len);
return -EINVAL;
}
+ len -= sizeof(u32);
+ if (len % PNFS_BLOCK_EXTENT_SIZE) {
+ dprintk("%s: extent array invalid: %u\n", __func__, len);
+ return -EINVAL;
+ }
nr_iomaps = be32_to_cpup(p++);
- expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
- if (len != expected) {
+ if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, expected);
+ __func__, len, nr_iomaps);
return -EINVAL;
}
@@ -155,3 +171,54 @@ fail:
kfree(iomaps);
return -EINVAL;
}
+
+/*
+ * Decode the layoutupdate body of a LAYOUTCOMMIT for the SCSI layout.
+ *
+ * @p:          XDR buffer: a 32-bit range count followed by that many
+ *              ranges, each decoded as two 64-bit values (offset, length)
+ * @len:        size in bytes of the buffer at @p
+ * @iomapp:     on success, set to a kcalloc()ed array of the decoded
+ *              ranges; the caller owns it and must kfree() it
+ * @block_size: every offset and length must be a multiple of this; the
+ *              mask test below assumes it is a power of two
+ *
+ * Returns the number of decoded ranges on success, -EINVAL if the buffer
+ * is malformed or a value is not block aligned, or -ENOMEM if the array
+ * cannot be allocated. *iomapp is only written on success.
+ */
+int
+nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+		u32 block_size)
+{
+	struct iomap *iomaps;
+	u32 nr_iomaps, i;
+
+	if (len < sizeof(u32)) {
+		dprintk("%s: extent array too small: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	/*
+	 * Check the client-supplied count by dividing the payload length
+	 * instead of multiplying the count: the product can wrap when
+	 * stored in 32 bits, letting a huge count pass the size check and
+	 * drive the decode loop past the end of the buffer. This mirrors
+	 * the check in nfsd4_block_decode_layoutupdate().
+	 */
+	len -= sizeof(u32);
+	if (len % PNFS_SCSI_RANGE_SIZE) {
+		dprintk("%s: extent array invalid: %u\n", __func__, len);
+		return -EINVAL;
+	}
+
+	nr_iomaps = be32_to_cpup(p++);
+	if (nr_iomaps != len / PNFS_SCSI_RANGE_SIZE) {
+		dprintk("%s: extent array size mismatch: %u/%u\n",
+			__func__, len, nr_iomaps);
+		return -EINVAL;
+	}
+
+	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+	if (!iomaps) {
+		dprintk("%s: failed to allocate extent array\n", __func__);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nr_iomaps; i++) {
+		u64 val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].offset = val;
+
+		p = xdr_decode_hyper(p, &val);
+		if (val & (block_size - 1)) {
+			dprintk("%s: unaligned length 0x%llx\n", __func__, val);
+			goto fail;
+		}
+		iomaps[i].length = val;
+	}
+
+	*iomapp = iomaps;
+	return nr_iomaps;
+fail:
+	kfree(iomaps);
+	return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 6de925fe8499..397bc7563a49 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -15,6 +15,11 @@ struct pnfs_block_extent {
enum pnfs_block_extent_state es;
};
+/*
+ * NOTE(review): presumably one (offset, length) range as carried in a SCSI
+ * layoutupdate -- no user is visible here, confirm against the callers.
+ */
+struct pnfs_block_range {
+ u64 foff; /* presumably the file offset of the range -- TODO confirm */
+ u64 len; /* presumably the length of the range -- TODO confirm */
+};
+
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
* Not actually limited by the protocol.
@@ -29,6 +34,13 @@ struct pnfs_block_volume {
u32 sig_len;
u8 sig[PNFS_BLOCK_UUID_LEN];
} simple;
+ struct {
+ enum scsi_code_set code_set;
+ enum scsi_designator_type designator_type;
+ int designator_len;
+ u8 designator[256];
+ u64 pr_key;
+ } scsi;
};
};
@@ -43,5 +55,7 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
struct nfsd4_layoutget *lgp);
int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
u32 block_size);
+int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size);
#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 77e7a5cca888..1a03bc3059e8 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -58,7 +58,7 @@ nlm_fclose(struct file *filp)
fput(filp);
}
-static struct nlmsvc_binding nfsd_nlm_ops = {
+static const struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
};
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d8b16c2568f3..5fbf3bbd00d0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -92,7 +92,7 @@ struct nfsd_net {
struct file *rec_file;
bool in_grace;
- struct nfsd4_client_tracking_ops *client_tracking_ops;
+ const struct nfsd4_client_tracking_ops *client_tracking_ops;
time_t nfsd4_lease;
time_t nfsd4_grace;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7b755b7f785c..51c3b06e8036 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -147,6 +147,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
{
__be32 nfserr;
u32 max_blocksize = svc_max_payload(rqstp);
+ unsigned long cnt = min(argp->count, max_blocksize);
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
@@ -157,7 +158,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
* 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
* + 1 (xdr opaque byte count) = 26
*/
- resp->count = min(argp->count, max_blocksize);
+ resp->count = cnt;
svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
@@ -167,8 +168,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
&resp->count);
if (nfserr == 0) {
struct inode *inode = d_inode(resp->fh.fh_dentry);
-
- resp->eof = (argp->offset + resp->count) >= inode->i_size;
+ resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset,
+ inode->i_size);
}
RETURN_STATUS(nfserr);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp)
err = fh_getattr(fhp, &fhp->fh_post_attr);
fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
if (err) {
- fhp->fh_post_saved = 0;
+ fhp->fh_post_saved = false;
/* Grab the ctime anyway - set_change_info might use it */
fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
} else
- fhp->fh_post_saved = 1;
+ fhp->fh_post_saved = true;
}
/*
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len(name, dparent, namlen);
+ dchild = lookup_one_len_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e7f50c4081d6..7389cb1d7409 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -792,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_DOWN;
warn_no_callback_path(clp, reason);
}
static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
{
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
clp->cl_cb_state = NFSD4_CB_FAULT;
warn_no_callback_path(clp, reason);
}
@@ -1143,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work)
}
void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
{
cb->cb_clp = clp;
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ebf90e487c75..825c7bc8d789 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
+#include <linux/blkdev.h>
#include <linux/kmod.h>
#include <linux/file.h>
#include <linux/jhash.h>
@@ -22,11 +23,16 @@ struct nfs4_layout {
static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
[LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+ [LAYOUT_SCSI] = &scsi_layout_ops,
+#endif
};
/* pNFS device ID to export fsid mapping */
@@ -121,10 +127,24 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
if (!(exp->ex_flags & NFSEXP_PNFS))
return;
+ /*
+ * Check if the file system supports exporting a block-like layout.
+ * If the block device supports reservations prefer the SCSI layout,
+ * otherwise advertise the block layout.
+ */
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
if (sb->s_export_op->get_uuid &&
sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks)
exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+ /* overwrite block layout selection if needed */
+ if (sb->s_export_op->map_blocks &&
+ sb->s_export_op->commit_blocks &&
+ sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
+ exp->ex_layout_type = LAYOUT_SCSI;
+#endif
}
static void
@@ -201,6 +221,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
INIT_LIST_HEAD(&ls->ls_perfile);
spin_lock_init(&ls->ls_lock);
INIT_LIST_HEAD(&ls->ls_layouts);
+ mutex_init(&ls->ls_mutex);
ls->ls_layout_type = layout_type;
nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
NFSPROC4_CLNT_CB_LAYOUT);
@@ -262,19 +283,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
status = nfserr_jukebox;
if (!ls)
goto out;
+ mutex_lock(&ls->ls_mutex);
} else {
ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
status = nfserr_bad_stateid;
+ mutex_lock(&ls->ls_mutex);
if (stateid->si_generation > stid->sc_stateid.si_generation)
- goto out_put_stid;
+ goto out_unlock_stid;
if (layout_type != ls->ls_layout_type)
- goto out_put_stid;
+ goto out_unlock_stid;
}
*lsp = ls;
return 0;
+out_unlock_stid:
+ mutex_unlock(&ls->ls_mutex);
out_put_stid:
nfs4_put_stid(stid);
out:
@@ -296,8 +321,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
trace_layout_recall(&ls->ls_stid.sc_stateid);
atomic_inc(&ls->ls_stid.sc_count);
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
nfsd4_run_cb(&ls->ls_recall);
out_unlock:
@@ -406,8 +429,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
list_add_tail(&new->lo_perstate, &ls->ls_layouts);
new = NULL;
done:
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid);
spin_unlock(&ls->ls_lock);
out:
spin_unlock(&fp->fi_lock);
@@ -481,11 +503,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
}
if (!list_empty(&ls->ls_layouts)) {
- if (found) {
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
- sizeof(stateid_t));
- }
+ if (found)
+ nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
lrp->lrs_present = 1;
} else {
trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
@@ -494,6 +513,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
spin_unlock(&ls->ls_lock);
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
nfsd4_free_layouts(&reaplist);
return nfs_ok;
@@ -590,8 +610,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
- trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
-
printk(KERN_WARNING
"nfsd: client %s failed to respond to layout recall. "
" Fencing..\n", addr_str);
@@ -608,34 +626,67 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
}
}
+/*
+ * ->prepare hook of the layout recall callback.
+ *
+ * With ls_mutex held, bumps the layout stateid's generation and stores
+ * the updated stateid in ls_recall_sid.
+ */
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+	struct nfs4_layout_stateid *ls;
+
+	ls = container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+	mutex_lock(&ls->ls_mutex);
+	nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+	mutex_unlock(&ls->ls_mutex);
+}
+
static int
nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
struct nfs4_layout_stateid *ls =
container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ struct nfsd_net *nn;
+ ktime_t now, cutoff;
+ const struct nfsd4_layout_ops *ops;
LIST_HEAD(reaplist);
+
switch (task->tk_status) {
case 0:
- return 1;
+ case -NFS4ERR_DELAY:
+ /*
+ * Anything left? If not, then call it done. Note that we don't
+ * take the spinlock since this is an optimization and nothing
+ * should get added until the cb counter goes to zero.
+ */
+ if (list_empty(&ls->ls_layouts))
+ return 1;
+
+ /* Poll the client until it's done with the layout */
+ now = ktime_get();
+ nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id);
+
+ /* Client gets 2 lease periods to return it */
+ cutoff = ktime_add_ns(task->tk_start,
+ nn->nfsd4_lease * NSEC_PER_SEC * 2);
+
+ if (ktime_before(now, cutoff)) {
+ rpc_delay(task, HZ/100); /* 10 mili-seconds */
+ return 0;
+ }
+ /* Fallthrough */
case -NFS4ERR_NOMATCHING_LAYOUT:
trace_layout_recall_done(&ls->ls_stid.sc_stateid);
task->tk_status = 0;
return 1;
- case -NFS4ERR_DELAY:
- /* Poll the client until it's done with the layout */
- /* FIXME: cap number of retries.
- * The pnfs standard states that we need to only expire
- * the client after at-least "lease time" .eg lease-time * 2
- * when failing to communicate a recall
- */
- rpc_delay(task, HZ/100); /* 10 mili-seconds */
- return 0;
default:
/*
* Unknown error or non-responding client, we'll need to fence.
*/
- nfsd4_cb_layout_fail(ls);
+ trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+
+ ops = nfsd4_layout_ops[ls->ls_layout_type];
+ if (ops->fence_client)
+ ops->fence_client(ls);
+ else
+ nfsd4_cb_layout_fail(ls);
return -1;
}
}
@@ -654,7 +705,8 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
nfs4_put_stid(&ls->ls_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+ .prepare = nfsd4_cb_layout_prepare,
.done = nfsd4_cb_layout_done,
.release = nfsd4_cb_layout_release,
};
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4ce6b97b31ad..de1ff1d98bb1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u
struct inode *inode = d_inode(resfh->fh_dentry);
int status;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
status = security_inode_setsecctx(resfh->fh_dentry,
label->data, label->len);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (status)
/*
@@ -774,8 +774,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
- status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid,
- RD_STATE, &read->rd_filp, &read->rd_tmp_file);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &read->rd_stateid, RD_STATE,
+ &read->rd_filp, &read->rd_tmp_file);
if (status) {
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
@@ -863,12 +864,10 @@ static __be32
nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_secinfo *secinfo)
{
- struct svc_fh resfh;
struct svc_export *exp;
struct dentry *dentry;
__be32 err;
- fh_init(&resfh, NFS4_FHSIZE);
err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
if (err)
return err;
@@ -877,6 +876,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&exp, &dentry);
if (err)
return err;
+ fh_unlock(&cstate->current_fh);
if (d_really_is_negative(dentry)) {
exp_put(exp);
err = nfserr_noent;
@@ -921,7 +921,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
status = nfs4_preprocess_stateid_op(rqstp, cstate,
- &setattr->sa_stateid, WR_STATE, NULL, NULL);
+ &cstate->current_fh, &setattr->sa_stateid,
+ WR_STATE, NULL, NULL);
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
return status;
@@ -985,8 +986,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (write->wr_offset >= OFFSET_MAX)
return nfserr_inval;
- status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE,
- &filp, NULL);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ stateid, WR_STATE, &filp, NULL);
if (status) {
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
@@ -1010,13 +1011,54 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
+/*
+ * CLONE operation handler.
+ *
+ * Resolves the source file from the saved filehandle (read state) and the
+ * destination file from the current filehandle (write state), rejects
+ * anything that is not a regular file with nfserr_wrong_type, and then
+ * calls nfsd4_clone_file_range() with the positions and count taken from
+ * @clone. Any file obtained along the way is released with fput() before
+ * returning. Returns the status of the first step that failed, or the
+ * result of nfsd4_clone_file_range().
+ */
 static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_clone *clone)
+{
+ struct file *src, *dst;
+ __be32 status;
+
+ /* source: saved filehandle, stateid must allow reading */
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
+ &clone->cl_src_stateid, RD_STATE,
+ &src, NULL);
+ if (status) {
+ dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+ goto out;
+ }
+
+ /* destination: current filehandle, stateid must allow writing */
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &clone->cl_dst_stateid, WR_STATE,
+ &dst, NULL);
+ if (status) {
+ dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+ goto out_put_src;
+ }
+
+ /* fix up for NFS-specific error code */
+ if (!S_ISREG(file_inode(src)->i_mode) ||
+ !S_ISREG(file_inode(dst)->i_mode)) {
+ status = nfserr_wrong_type;
+ goto out_put_dst;
+ }
+
+ status = nfsd4_clone_file_range(src, clone->cl_src_pos,
+ dst, clone->cl_dst_pos, clone->cl_count);
+
+ /* unwind in reverse order of acquisition */
+out_put_dst:
+ fput(dst);
+out_put_src:
+ fput(src);
+out:
+ return status;
+}
+
+static __be32
nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_fallocate *fallocate, int flags)
{
__be32 status = nfserr_notsupp;
struct file *file;
- status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&fallocate->falloc_stateid,
WR_STATE, &file, NULL);
if (status != nfs_ok) {
@@ -1055,7 +1097,7 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct file *file;
- status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
&seek->seek_stateid,
RD_STATE, &file, NULL);
if (status) {
@@ -1226,8 +1268,10 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
goto out;
nfserr = nfs_ok;
- if (gdp->gd_maxcount != 0)
- nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+ if (gdp->gd_maxcount != 0) {
+ nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
+ cstate->session->se_client, gdp);
+ }
gdp->gd_notify_types &= ops->notify_types;
out:
@@ -1309,6 +1353,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
nfserr = nfsd4_insert_layout(lgp, ls);
out_put_stid:
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
out:
return nfserr;
@@ -1362,6 +1407,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
goto out;
}
+ /* LAYOUTCOMMIT does not require any serialization */
+ mutex_unlock(&ls->ls_mutex);
+
if (new_size > i_size_read(inode)) {
lcp->lc_size_chg = 1;
lcp->lc_newsize = new_size;
@@ -2275,6 +2323,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_DEALLOCATE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+ [OP_CLONE] = {
+ .op_func = (nfsd4op_func)nfsd4_clone,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_CLONE",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e3d47091b191..66eaeb1e8c2c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,10 +32,10 @@
*
*/
+#include <crypto/hash.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
-#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/module.h>
@@ -104,29 +104,35 @@ static int
nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
{
struct xdr_netobj cksum;
- struct hash_desc desc;
- struct scatterlist sg;
+ struct crypto_shash *tfm;
int status;
dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
clname->len, clname->data);
- desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm)) {
- status = PTR_ERR(desc.tfm);
+ tfm = crypto_alloc_shash("md5", 0, 0);
+ if (IS_ERR(tfm)) {
+ status = PTR_ERR(tfm);
goto out_no_tfm;
}
- cksum.len = crypto_hash_digestsize(desc.tfm);
+ cksum.len = crypto_shash_digestsize(tfm);
cksum.data = kmalloc(cksum.len, GFP_KERNEL);
if (cksum.data == NULL) {
status = -ENOMEM;
goto out;
}
- sg_init_one(&sg, clname->data, clname->len);
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+
+ desc->tfm = tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ status = crypto_shash_digest(desc, clname->data, clname->len,
+ cksum.data);
+ shash_desc_zero(desc);
+ }
- status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
if (status)
goto out;
@@ -135,7 +141,7 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
status = 0;
out:
kfree(cksum.data);
- crypto_free_hash(desc.tfm);
+ crypto_free_shash(tfm);
out_no_tfm:
return status;
}
@@ -192,7 +198,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dir = nn->rec_file->f_path.dentry;
/* lock the parent */
- mutex_lock(&d_inode(dir)->i_mutex);
+ inode_lock(d_inode(dir));
dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
if (IS_ERR(dentry)) {
@@ -213,7 +219,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
out_put:
dput(dentry);
out_unlock:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
if (status == 0) {
if (nn->in_grace) {
crp = nfs4_client_to_reclaim(dname, nn);
@@ -286,7 +292,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
}
status = iterate_dir(nn->rec_file, &ctx.ctx);
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
if (!status) {
@@ -302,7 +308,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
list_del(&entry->list);
kfree(entry);
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
nfs4_reset_creds(original_cred);
list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
@@ -322,7 +328,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
dir = nn->rec_file->f_path.dentry;
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
dentry = lookup_one_len(name, dir, namlen);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
@@ -335,7 +341,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
out:
dput(dentry);
out_unlock:
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
return status;
}
@@ -631,7 +637,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
return -ENOENT;
}
-static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
.init = nfsd4_legacy_tracking_init,
.exit = nfsd4_legacy_tracking_exit,
.create = nfsd4_create_clid_dir,
@@ -1050,7 +1056,7 @@ out_err:
printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
}
-static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
.init = nfsd4_init_cld_pipe,
.exit = nfsd4_remove_cld_pipe,
.create = nfsd4_cld_create,
@@ -1260,6 +1266,7 @@ nfsd4_umh_cltrack_init(struct net *net)
/* XXX: The usermode helper s not working in container yet. */
if (net != &init_net) {
pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
+ kfree(grace_start);
return -EINVAL;
}
@@ -1394,7 +1401,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
kfree(legacy);
}
-static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
.init = nfsd4_umh_cltrack_init,
.exit = NULL,
.create = nfsd4_umh_cltrack_create,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f1d5691b795..0462eeddfff9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab;
static void free_session(struct nfsd4_session *);
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
static bool is_session_dead(struct nfsd4_session *ses)
{
@@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
atomic_set(&stid->sc_count, 1);
+ spin_lock_init(&stid->sc_lock);
/*
* It shouldn't be a problem to reuse an opaque stateid value.
@@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s)
put_nfs4_file(fp);
}
+/*
+ * Advance the generation counter of @stid's stateid and copy the updated
+ * stateid into @dst. Both steps happen under stid->sc_lock. A generation
+ * that wraps around to zero is moved on to 1.
+ */
+void
+nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
+{
+	stateid_t *cur = &stid->sc_stateid;
+
+	spin_lock(&stid->sc_lock);
+	cur->si_generation++;
+	if (unlikely(cur->si_generation == 0))
+		cur->si_generation = 1;
+	memcpy(dst, cur, sizeof(*dst));
+	spin_unlock(&stid->sc_lock);
+}
+
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
struct file *filp = NULL;
@@ -765,16 +778,68 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
s->sc_type = 0;
}
-static void
+/**
+ * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * @clp: a pointer to the nfs4_client we're granting a delegation to
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Walks fp->fi_delegations looking for an entry whose stateid belongs to
+ * @clp. The caller must hold both state_lock and fp->fi_lock.
+ *
+ * Return:
+ * 0 if @clp holds no delegation on @fp, or -EAGAIN if one was previously
+ * granted to this nfs4_client for this nfs4_file.
+ */
+static int
+nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_delegation *dp;
+
+	lockdep_assert_held(&state_lock);
+	lockdep_assert_held(&fp->fi_lock);
+
+	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+		if (dp->dl_stid.sc_client == clp)
+			return -EAGAIN;
+
+	return 0;
+}
+
+/**
+ * hash_delegation_locked - Add a delegation to the appropriate lists
+ * @dp: a pointer to the nfs4_delegation we are adding.
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ * On success: 0 if the delegation was successfully hashed.
+ *
+ * On error: -EAGAIN if one was previously granted to this
+ * nfs4_client for this nfs4_file. Delegation is not hashed.
+ *
+ */
+
+static int
hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
+ int status;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
lockdep_assert_held(&state_lock);
lockdep_assert_held(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ if (status)
+ return status;
+ ++fp->fi_delegees;
atomic_inc(&dp->dl_stid.sc_count);
dp->dl_stid.sc_type = NFS4_DELEG_STID;
list_add(&dp->dl_perfile, &fp->fi_delegations);
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+ list_add(&dp->dl_perclnt, &clp->cl_delegations);
+ return 0;
}
static bool
@@ -1792,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+int strdup_if_nonnull(char **target, char *source)
{
- if (source->cr_principal) {
- target->cr_principal =
- kstrdup(source->cr_principal, GFP_KERNEL);
- if (target->cr_principal == NULL)
+ if (source) {
+ *target = kstrdup(source, GFP_KERNEL);
+ if (!*target)
return -ENOMEM;
} else
- target->cr_principal = NULL;
+ *target = NULL;
+ return 0;
+}
+
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
+ int ret;
+
+ ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
+ if (ret)
+ return ret;
+ ret = strdup_if_nonnull(&target->cr_raw_principal,
+ source->cr_raw_principal);
+ if (ret)
+ return ret;
target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
@@ -1904,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
return false;
if (!svc_rqst_integrity_protected(rqstp))
return false;
+ if (cl->cl_cred.cr_raw_principal)
+ return 0 == strcmp(cl->cl_cred.cr_raw_principal,
+ cr->cr_raw_principal);
if (!cr->cr_principal)
return false;
return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
@@ -2175,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
base = resp->cstate.data_offset;
slot->sl_datalen = buf->len - base;
if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
- WARN("%s: sessions DRC could not cache compound\n", __func__);
+ WARN(1, "%s: sessions DRC could not cache compound\n",
+ __func__);
return;
}
@@ -2256,15 +2338,20 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
clid->flags = new->cl_exchange_flags;
}
+/*
+ * Report whether any open owner on @clp's cl_openowners list still has at
+ * least one entry on its so_stateids list.
+ */
+static bool client_has_openowners(struct nfs4_client *clp)
+{
+	struct nfs4_openowner *owner;
+
+	list_for_each_entry(owner, &clp->cl_openowners, oo_perclient) {
+		if (list_empty(&owner->oo_owner.so_stateids))
+			continue;
+		return true;
+	}
+	return false;
+}
+
static bool client_has_state(struct nfs4_client *clp)
{
- /*
- * Note clp->cl_openowners check isn't quite right: there's no
- * need to count owners without stateid's.
- *
- * Also note we should probably be using this in 4.0 case too.
- */
- return !list_empty(&clp->cl_openowners)
+ return client_has_openowners(clp)
#ifdef CONFIG_NFSD_PNFS
|| !list_empty(&clp->cl_lo_states)
#endif
@@ -2295,22 +2382,36 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
return nfserr_inval;
+ new = create_client(exid->clname, rqstp, &verf);
+ if (new == NULL)
+ return nfserr_jukebox;
+
switch (exid->spa_how) {
case SP4_MACH_CRED:
- if (!svc_rqst_integrity_protected(rqstp))
- return nfserr_inval;
+ if (!svc_rqst_integrity_protected(rqstp)) {
+ status = nfserr_inval;
+ goto out_nolock;
+ }
+ /*
+ * Sometimes userspace doesn't give us a principal.
+ * Which is a bug, really. Anyway, we can't enforce
+ * MACH_CRED in that case, better to give up now:
+ */
+ if (!new->cl_cred.cr_principal &&
+ !new->cl_cred.cr_raw_principal) {
+ status = nfserr_serverfault;
+ goto out_nolock;
+ }
+ new->cl_mach_cred = true;
case SP4_NONE:
break;
default: /* checked by xdr code */
WARN_ON_ONCE(1);
case SP4_SSV:
- return nfserr_encr_alg_unsupp;
+ status = nfserr_encr_alg_unsupp;
+ goto out_nolock;
}
- new = create_client(exid->clname, rqstp, &verf);
- if (new == NULL)
- return nfserr_jukebox;
-
/* Cases below refer to rfc 5661 section 18.35.4: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&exid->clname, nn);
@@ -2372,7 +2473,6 @@ out_new:
goto out;
}
new->cl_minorversion = cstate->minorversion;
- new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -2390,6 +2490,7 @@ out_copy:
out:
spin_unlock(&nn->client_lock);
+out_nolock:
if (new)
expire_client(new);
if (unconf)
@@ -2486,21 +2587,26 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
return nfs_ok;
}
+/*
+ * Server's NFSv4.1 backchannel support is AUTH_SYS-only for now.
+ * These are based on similar macros in linux/sunrpc/msg_prot.h .
+ */
+#define RPC_MAX_HEADER_WITH_AUTH_SYS \
+ (RPC_CALLHDRSIZE + 2 * (2 + UNX_CALLSLACK))
+
+#define RPC_MAX_REPHEADER_WITH_AUTH_SYS \
+ (RPC_REPHDRSIZE + (2 + NUL_REPLYSLACK))
+
#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
- RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32))
+ RPC_MAX_HEADER_WITH_AUTH_SYS) * sizeof(__be32))
#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
- RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32))
+ RPC_MAX_REPHEADER_WITH_AUTH_SYS) * \
+ sizeof(__be32))
static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
{
ca->headerpadsz = 0;
- /*
- * These RPC_MAX_HEADER macros are overkill, especially since we
- * don't even do gss on the backchannel yet. But this is still
- * less than 1k. Tighten up this estimate in the unlikely event
- * it turns out to be a problem for some client:
- */
if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
return nfserr_toosmall;
if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
@@ -2610,10 +2716,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
status = nfs_ok;
- /*
- * We do not support RDMA or persistent sessions
- */
+ /* Persistent sessions are not supported */
cr_ses->flags &= ~SESSION4_PERSIST;
+ /* Upshifting from TCP to RDMA is not supported */
cr_ses->flags &= ~SESSION4_RDMA;
init_session(rqstp, new, conf, cr_ses);
@@ -3049,7 +3154,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Cases below refer to rfc 3530 section 14.2.33: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&clname, nn);
- if (conf) {
+ if (conf && client_has_state(conf)) {
/* case 0: */
status = nfserr_clid_inuse;
if (clp_used_exchangeid(conf))
@@ -3136,6 +3241,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
} else { /* case 3: normal case; new or rebooted client */
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
+ status = nfserr_clid_inuse;
+ if (client_has_state(old)
+ && !same_creds(&unconf->cl_cred,
+ &old->cl_cred))
+ goto out;
status = mark_client_expired_locked(old);
if (status) {
old = NULL;
@@ -3317,6 +3427,27 @@ static const struct nfs4_stateowner_operations openowner_ops = {
.so_free = nfs4_free_openowner,
};
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+ struct nfs4_ol_stateid *local, *ret = NULL;
+ struct nfs4_openowner *oo = open->op_openowner;
+
+ lockdep_assert_held(&fp->fi_lock);
+
+ list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
+ /* ignore lock owners */
+ if (local->st_stateowner->so_is_open_owner == 0)
+ continue;
+ if (local->st_stateowner == &oo->oo_owner) {
+ ret = local;
+ atomic_inc(&ret->st_stid.sc_count);
+ break;
+ }
+ }
+ return ret;
+}
+
static struct nfs4_openowner *
alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
struct nfsd4_compound_state *cstate)
@@ -3348,9 +3479,20 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
return ret;
}
-static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+static struct nfs4_ol_stateid *
+init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
+ struct nfsd4_open *open)
+{
+
struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_ol_stateid *retstp = NULL;
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ spin_lock(&fp->fi_lock);
+
+ retstp = nfsd4_find_existing_open(fp, open);
+ if (retstp)
+ goto out_unlock;
atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_locks);
@@ -3360,12 +3502,14 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
stp->st_openstp = NULL;
- spin_lock(&oo->oo_owner.so_client->cl_lock);
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
- spin_lock(&fp->fi_lock);
list_add(&stp->st_perfile, &fp->fi_stateids);
+
+out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
+ return retstp;
}
/*
@@ -3539,7 +3683,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
nfs4_put_stid(&dp->dl_stid);
}
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
.prepare = nfsd4_cb_recall_prepare,
.done = nfsd4_cb_recall_done,
.release = nfsd4_cb_recall_release,
@@ -3776,27 +3920,6 @@ out:
return nfs_ok;
}
-static struct nfs4_ol_stateid *
-nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
-{
- struct nfs4_ol_stateid *local, *ret = NULL;
- struct nfs4_openowner *oo = open->op_openowner;
-
- spin_lock(&fp->fi_lock);
- list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
- /* ignore lock owners */
- if (local->st_stateowner->so_is_open_owner == 0)
- continue;
- if (local->st_stateowner == &oo->oo_owner) {
- ret = local;
- atomic_inc(&ret->st_stid.sc_count);
- break;
- }
- }
- spin_unlock(&fp->fi_lock);
- return ret;
-}
-
static inline int nfs4_access_to_access(u32 nfs4_access)
{
int flags = 0;
@@ -3945,6 +4068,18 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
return fl;
}
+/**
+ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
+ * @dp: a pointer to the nfs4_delegation we're adding.
+ *
+ * Return:
+ * On success: Return code will be 0 on success.
+ *
+ * On error: -EAGAIN if there was an existing delegation.
+ * nonzero if there is an error in other cases.
+ *
+ */
+
static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
@@ -3976,16 +4111,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
goto out_unlock;
/* Race breaker */
if (fp->fi_deleg_file) {
- status = 0;
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
+ status = hash_delegation_locked(dp, fp);
goto out_unlock;
}
fp->fi_deleg_file = filp;
- fp->fi_delegees = 1;
- hash_delegation_locked(dp, fp);
+ fp->fi_delegees = 0;
+ status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
+ if (status) {
+ /* Should never happen, this is a new fi_deleg_file */
+ WARN_ON_ONCE(1);
+ goto out_fput;
+ }
return 0;
out_unlock:
spin_unlock(&fp->fi_lock);
@@ -4005,6 +4143,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+
+ if (status)
+ return ERR_PTR(status);
+
dp = alloc_init_deleg(clp, fh, odstate);
if (!dp)
return ERR_PTR(-ENOMEM);
@@ -4023,9 +4170,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
status = -EAGAIN;
goto out_unlock;
}
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
- status = 0;
+ status = hash_delegation_locked(dp, fp);
out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -4160,6 +4305,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct nfs4_ol_stateid *stp = NULL;
+ struct nfs4_ol_stateid *swapstp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
@@ -4173,7 +4319,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
+ spin_lock(&fp->fi_lock);
stp = nfsd4_find_existing_open(fp, open);
+ spin_unlock(&fp->fi_lock);
} else {
open->op_file = NULL;
status = nfserr_bad_stateid;
@@ -4187,15 +4335,32 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
*/
if (stp) {
/* Stateid was found, this is an OPEN upgrade */
+ down_read(&stp->st_rwsem);
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
- if (status)
+ if (status) {
+ up_read(&stp->st_rwsem);
goto out;
+ }
} else {
stp = open->op_stp;
open->op_stp = NULL;
- init_open_stateid(stp, fp, open);
+ swapstp = init_open_stateid(stp, fp, open);
+ if (swapstp) {
+ nfs4_put_stid(&stp->st_stid);
+ stp = swapstp;
+ down_read(&stp->st_rwsem);
+ status = nfs4_upgrade_open(rqstp, fp, current_fh,
+ stp, open);
+ if (status) {
+ up_read(&stp->st_rwsem);
+ goto out;
+ }
+ goto upgrade_out;
+ }
+ down_read(&stp->st_rwsem);
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
if (status) {
+ up_read(&stp->st_rwsem);
release_open_stateid(stp);
goto out;
}
@@ -4205,8 +4370,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (stp->st_clnt_odstate == open->op_odstate)
open->op_odstate = NULL;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+upgrade_out:
+ nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
+ up_read(&stp->st_rwsem);
if (nfsd4_has_session(&resp->cstate)) {
if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4410,8 +4576,7 @@ static void
laundromat_main(struct work_struct *laundry)
{
time_t t;
- struct delayed_work *dwork = container_of(laundry, struct delayed_work,
- work);
+ struct delayed_work *dwork = to_delayed_work(laundry);
struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
laundromat_work);
@@ -4666,10 +4831,9 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
*/
__be32
nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *cstate, stateid_t *stateid,
- int flags, struct file **filpp, bool *tmp_file)
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file)
{
- struct svc_fh *fhp = &cstate->current_fh;
struct inode *ino = d_inode(fhp->fh_dentry);
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -4819,10 +4983,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
* revoked delegations are kept only for free_stateid.
*/
return nfserr_bad_stateid;
+ down_write(&stp->st_rwsem);
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
- if (status)
- return status;
- return nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status == nfs_ok)
+ status = nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status != nfs_ok)
+ up_write(&stp->st_rwsem);
+ return status;
}
/*
@@ -4869,6 +5036,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
return status;
oo = openowner(stp->st_stateowner);
if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
return nfserr_bad_stateid;
}
@@ -4899,11 +5067,13 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
oo = openowner(stp->st_stateowner);
status = nfserr_bad_stateid;
- if (oo->oo_flags & NFS4_OO_CONFIRMED)
+ if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+ up_write(&stp->st_rwsem);
goto put_stateid;
+ }
oo->oo_flags |= NFS4_OO_CONFIRMED;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -4975,13 +5145,11 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
goto put_stateid;
}
nfs4_stateid_downgrade(stp, od->od_share_access);
-
reset_union_bmap_deny(od->od_share_deny, stp);
-
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
status = nfs_ok;
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -5033,8 +5201,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_bump_seqid(cstate, status);
if (status)
goto out;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
nfsd4_close_open_stateid(stp);
@@ -5260,6 +5428,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
spin_lock(&fp->fi_lock);
@@ -5428,6 +5597,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&open_stp, nn);
if (status)
goto out;
+ up_write(&open_stp->st_rwsem);
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5435,6 +5605,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
+ if (status == nfs_ok)
+ down_write(&lock_stp->st_rwsem);
} else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
@@ -5512,9 +5684,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
switch (-err) {
case 0: /* success! */
- update_stateid(&lock_stp->st_stid.sc_stateid);
- memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
- sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
status = 0;
break;
case (EAGAIN): /* conflock holds conflicting lock */
@@ -5540,6 +5710,8 @@ out:
seqid_mutating_err(ntohl(status)))
lock_sop->lo_owner.so_seqid++;
+ up_write(&lock_stp->st_rwsem);
+
/*
* If this is a new, never-before-used stateid, and we are
* returning an error, then just go ahead and release it.
@@ -5704,11 +5876,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
goto out_nfserr;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
fput:
fput(filp);
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..9df898ba648f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1072,8 +1072,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
READ_BUF(4);
rename->rn_snamelen = be32_to_cpup(p++);
- READ_BUF(rename->rn_snamelen + 4);
+ READ_BUF(rename->rn_snamelen);
SAVEMEM(rename->rn_sname, rename->rn_snamelen);
+ READ_BUF(4);
rename->rn_tnamelen = be32_to_cpup(p++);
READ_BUF(rename->rn_tnamelen);
SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
@@ -1155,13 +1156,14 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
READ_BUF(8);
setclientid->se_callback_prog = be32_to_cpup(p++);
setclientid->se_callback_netid_len = be32_to_cpup(p++);
-
- READ_BUF(setclientid->se_callback_netid_len + 4);
+ READ_BUF(setclientid->se_callback_netid_len);
SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
+ READ_BUF(4);
setclientid->se_callback_addr_len = be32_to_cpup(p++);
- READ_BUF(setclientid->se_callback_addr_len + 4);
+ READ_BUF(setclientid->se_callback_addr_len);
SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
+ READ_BUF(4);
setclientid->se_callback_ident = be32_to_cpup(p++);
DECODE_TAIL;
@@ -1675,6 +1677,25 @@ nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
}
static __be32
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+{
+ DECODE_HEAD;
+
+ status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
+ if (status)
+ return status;
+ status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 8 + 8);
+ p = xdr_decode_hyper(p, &clone->cl_src_pos);
+ p = xdr_decode_hyper(p, &clone->cl_dst_pos);
+ p = xdr_decode_hyper(p, &clone->cl_count);
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1785,6 +1806,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
};
static inline bool
@@ -1815,8 +1837,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
READ_BUF(4);
argp->taglen = be32_to_cpup(p++);
- READ_BUF(argp->taglen + 8);
+ READ_BUF(argp->taglen);
SAVEMEM(argp->tag, argp->taglen);
+ READ_BUF(8);
argp->minorversion = be32_to_cpup(p++);
argp->opcnt = be32_to_cpup(p++);
max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
@@ -2838,14 +2861,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
if (d_really_is_negative(dentry)) {
/*
- * nfsd_buffered_readdir drops the i_mutex between
- * readdir and calling this callback, leaving a window
- * where this directory entry could have gone away.
+ * we're not holding the i_mutex here, so there's
+ * a window where this directory entry could have gone
+ * away.
*/
dput(dentry);
return nfserr_noent;
@@ -3040,7 +3063,7 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
NFS4_MAX_SESSIONID_LEN);
*p++ = cpu_to_be32(bcts->dir);
- /* Sorry, we do not yet support RDMA over 4.1: */
+ /* Upshifting from TCP to RDMA is not supported */
*p++ = cpu_to_be32(0);
}
return nfserr;
@@ -3342,6 +3365,7 @@ static __be32 nfsd4_encode_splice_read(
struct xdr_stream *xdr = &resp->xdr;
struct xdr_buf *buf = xdr->buf;
u32 eof;
+ long len;
int space_left;
__be32 nfserr;
__be32 *p = xdr->p - 2;
@@ -3350,6 +3374,7 @@ static __be32 nfsd4_encode_splice_read(
if (xdr->end - xdr->p < 1)
return nfserr_resource;
+ len = maxcount;
nfserr = nfsd_splice_read(read->rd_rqstp, file,
read->rd_offset, &maxcount);
if (nfserr) {
@@ -3362,8 +3387,8 @@ static __be32 nfsd4_encode_splice_read(
return nfserr;
}
- eof = (read->rd_offset + maxcount >=
- d_inode(read->rd_fhp->fh_dentry)->i_size);
+ eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
+ d_inode(read->rd_fhp->fh_dentry)->i_size);
*(p++) = htonl(eof);
*(p++) = htonl(maxcount);
@@ -3433,14 +3458,15 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
}
read->rd_vlen = v;
+ len = maxcount;
nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
read->rd_vlen, &maxcount);
if (nfserr)
return nfserr;
xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
- eof = (read->rd_offset + maxcount >=
- d_inode(read->rd_fhp->fh_dentry)->i_size);
+ eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
+ d_inode(read->rd_fhp->fh_dentry)->i_size);
tmp = htonl(eof);
write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
@@ -4292,6 +4318,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
};
/*
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 46ec934f5dee..54cde9a5864e 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -63,7 +63,6 @@ static unsigned int longest_chain;
static unsigned int longest_chain_cachesize;
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
-static void cache_cleaner_func(struct work_struct *unused);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
@@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
};
/*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
- */
-static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-
-/*
* Put a cap on the size of the DRC based on the amount of available
* low memory in the machine.
*
@@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void)
unsigned int i;
unregister_shrinker(&nfsd_reply_cache_shrinker);
- cancel_delayed_work_sync(&cache_cleaner);
for (i = 0; i < drc_hashsize; i++) {
struct list_head *head = &drc_hashtbl[i].lru_head;
@@ -217,10 +208,8 @@ void nfsd_reply_cache_shutdown(void)
drc_hashtbl = NULL;
drc_hashsize = 0;
- if (drc_slab) {
- kmem_cache_destroy(drc_slab);
- drc_slab = NULL;
- }
+ kmem_cache_destroy(drc_slab);
+ drc_slab = NULL;
}
/*
@@ -232,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &b->lru_head);
- schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
static long
@@ -266,7 +254,6 @@ prune_cache_entries(void)
{
unsigned int i;
long freed = 0;
- bool cancel = true;
for (i = 0; i < drc_hashsize; i++) {
struct nfsd_drc_bucket *b = &drc_hashtbl[i];
@@ -275,26 +262,11 @@ prune_cache_entries(void)
continue;
spin_lock(&b->cache_lock);
freed += prune_bucket(b);
- if (!list_empty(&b->lru_head))
- cancel = false;
spin_unlock(&b->cache_lock);
}
-
- /*
- * Conditionally rearm the job to run in RC_EXPIRE since we just
- * ran the pruner.
- */
- if (!cancel)
- mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
return freed;
}
-static void
-cache_cleaner_func(struct work_struct *unused)
-{
- prune_cache_entries();
-}
-
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 350041a40fe5..c1681ce894c5 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp)
fh_unlock(fhp);
fhp->fh_dentry = NULL;
dput(dentry);
-#ifdef CONFIG_NFSD_V3
- fhp->fh_pre_saved = 0;
- fhp->fh_post_saved = 0;
-#endif
+ fh_clear_wcc(fhp);
}
fh_drop_write(fhp);
if (exp) {
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 1e90dad4926b..f84fe6bf9aee 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -7,6 +7,7 @@
#ifndef _LINUX_NFSD_NFSFH_H
#define _LINUX_NFSD_NFSFH_H
+#include <linux/crc32.h>
#include <linux/sunrpc/svc.h>
#include <uapi/linux/nfsd/nfsfh.h>
@@ -26,16 +27,16 @@ static inline ino_t u32_to_ino_t(__u32 uino)
*/
typedef struct svc_fh {
struct knfsd_fh fh_handle; /* FH data */
+ int fh_maxsize; /* max size for fh_handle */
struct dentry * fh_dentry; /* validated dentry */
struct svc_export * fh_export; /* export pointer */
- int fh_maxsize; /* max size for fh_handle */
- unsigned char fh_locked; /* inode locked by us */
- unsigned char fh_want_write; /* remount protection taken */
+ bool fh_locked; /* inode locked by us */
+ bool fh_want_write; /* remount protection taken */
#ifdef CONFIG_NFSD_V3
- unsigned char fh_post_saved; /* post-op attrs saved */
- unsigned char fh_pre_saved; /* pre-op attrs saved */
+ bool fh_post_saved; /* post-op attrs saved */
+ bool fh_pre_saved; /* pre-op attrs saved */
/* Pre-op attributes saved during fh_lock */
__u64 fh_pre_size; /* size before operation */
@@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
return true;
}
+#ifdef CONFIG_CRC32
+/**
+ * knfsd_fh_hash - calculate the crc32 hash for the filehandle
+ * @fh - pointer to filehandle
+ *
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+}
+#else
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_NFSD_V3
/*
* The wcc data stored in current_fh should be cleared
@@ -213,8 +236,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
static inline void
fh_clear_wcc(struct svc_fh *fhp)
{
- fhp->fh_post_saved = 0;
- fhp->fh_pre_saved = 0;
+ fhp->fh_post_saved = false;
+ fhp->fh_pre_saved = false;
}
/*
@@ -231,7 +254,7 @@ fill_pre_wcc(struct svc_fh *fhp)
fhp->fh_pre_ctime = inode->i_ctime;
fhp->fh_pre_size = inode->i_size;
fhp->fh_pre_change = inode->i_version;
- fhp->fh_pre_saved = 1;
+ fhp->fh_pre_saved = true;
}
}
@@ -265,9 +288,9 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
}
inode = d_inode(dentry);
- mutex_lock_nested(&inode->i_mutex, subclass);
+ inode_lock_nested(inode, subclass);
fill_pre_wcc(fhp);
- fhp->fh_locked = 1;
+ fhp->fh_locked = true;
}
static inline void
@@ -284,8 +307,8 @@ fh_unlock(struct svc_fh *fhp)
{
if (fhp->fh_locked) {
fill_post_wcc(fhp);
- mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
- fhp->fh_locked = 0;
+ inode_unlock(d_inode(fhp->fh_dentry));
+ fhp->fh_locked = false;
}
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ad4e2377dd63..45007acaf364 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -14,9 +14,13 @@
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/bind.h>
#include <linux/nfsacl.h>
#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
#include <net/net_namespace.h>
#include "nfsd.h"
#include "cache.h"
@@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net)
nfsd_shutdown_generic();
}
+static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct net_device *dev = ifa->ifa_dev->dev;
+ struct net *net = dev_net(dev);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in sin;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inetaddr_notifier = {
+ .notifier_call = nfsd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int nfsd_inet6addr_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct net_device *dev = ifa->idev->dev;
+ struct net *net = dev_net(dev);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in6 sin6;
+
+ if (event != NETDEV_DOWN)
+ goto out;
+
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
+ }
+
+out:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inet6addr_notifier = {
+ .notifier_call = nfsd_inet6addr_event,
+};
+#endif
+
static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
* started, then nfsd_last_thread will be run before any of this
- * other initialization has been done.
+ * other initialization has been done except the rpcb information.
*/
+ svc_rpcb_cleanup(serv, net);
if (!nn->nfsd_net_up)
return;
- nfsd_shutdown_net(net);
-
- svc_rpcb_cleanup(serv, net);
+ nfsd_shutdown_net(net);
printk(KERN_WARNING "nfsd: last server has exited, flushing export "
"cache\n");
nfsd_export_flush(net);
@@ -425,6 +488,10 @@ int nfsd_create_serv(struct net *net)
}
set_max_drc();
+ register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
return 0;
}
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index d4c4453674c6..7d073b9b1553 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
u32 notify_types;
__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+ struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
struct nfsd4_getdeviceinfo *gdevp);
@@ -32,10 +33,17 @@ struct nfsd4_layout_ops {
__be32 (*proc_layoutcommit)(struct inode *inode,
struct nfsd4_layoutcommit *lcp);
+
+ void (*fence_client)(struct nfs4_layout_stateid *ls);
};
extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
extern const struct nfsd4_layout_ops bl_layout_ops;
+#endif
+#ifdef CONFIG_NFSD_SCSILAYOUT
+extern const struct nfsd4_layout_ops scsi_layout_ops;
+#endif
__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 583ffc13cae2..c050c53036a6 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -65,7 +65,7 @@ struct nfsd4_callback {
struct nfs4_client *cb_clp;
u32 cb_minorversion;
struct rpc_message cb_msg;
- struct nfsd4_callback_ops *cb_ops;
+ const struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
int cb_seq_status;
int cb_status;
@@ -84,7 +84,7 @@ struct nfsd4_callback_ops {
* fields that are of general use to any stateid.
*/
struct nfs4_stid {
- atomic_t sc_count;
+ atomic_t sc_count;
#define NFS4_OPEN_STID 1
#define NFS4_LOCK_STID 2
#define NFS4_DELEG_STID 4
@@ -94,11 +94,12 @@ struct nfs4_stid {
#define NFS4_REVOKED_DELEG_STID 16
#define NFS4_CLOSED_DELEG_STID 32
#define NFS4_LAYOUT_STID 64
- unsigned char sc_type;
- stateid_t sc_stateid;
- struct nfs4_client *sc_client;
- struct nfs4_file *sc_file;
- void (*sc_free)(struct nfs4_stid *);
+ unsigned char sc_type;
+ stateid_t sc_stateid;
+ spinlock_t sc_lock;
+ struct nfs4_client *sc_client;
+ struct nfs4_file *sc_file;
+ void (*sc_free)(struct nfs4_stid *);
};
/*
@@ -364,15 +365,6 @@ struct nfs4_client_reclaim {
char cr_recdir[HEXDIR_LEN]; /* recover dir */
};
-static inline void
-update_stateid(stateid_t *stateid)
-{
- stateid->si_generation++;
- /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
- if (stateid->si_generation == 0)
- stateid->si_generation = 1;
-}
-
/* A reasonable value for REPLAY_ISIZE was estimated as follows:
* The OPEN response, typically the largest, requires
* 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) +
@@ -534,15 +526,16 @@ struct nfs4_file {
* Better suggestions welcome.
*/
struct nfs4_ol_stateid {
- struct nfs4_stid st_stid; /* must be first field */
- struct list_head st_perfile;
- struct list_head st_perstateowner;
- struct list_head st_locks;
- struct nfs4_stateowner * st_stateowner;
- struct nfs4_clnt_odstate * st_clnt_odstate;
- unsigned char st_access_bmap;
- unsigned char st_deny_bmap;
- struct nfs4_ol_stateid * st_openstp;
+ struct nfs4_stid st_stid;
+ struct list_head st_perfile;
+ struct list_head st_perstateowner;
+ struct list_head st_locks;
+ struct nfs4_stateowner *st_stateowner;
+ struct nfs4_clnt_odstate *st_clnt_odstate;
+ unsigned char st_access_bmap;
+ unsigned char st_deny_bmap;
+ struct nfs4_ol_stateid *st_openstp;
+ struct rw_semaphore st_rwsem;
};
static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
@@ -561,6 +554,7 @@ struct nfs4_layout_stateid {
struct nfsd4_callback ls_recall;
stateid_t ls_recall_sid;
bool ls_recalled;
+ struct mutex ls_mutex;
};
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
@@ -584,8 +578,8 @@ struct nfsd4_compound_state;
struct nfsd_net;
extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
- struct nfsd4_compound_state *cstate, stateid_t *stateid,
- int flags, struct file **filp, bool *tmp_file);
+ struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+ stateid_t *stateid, int flags, struct file **filp, bool *tmp_file);
__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn);
@@ -593,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
struct kmem_cache *slab);
void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
@@ -604,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
- struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern void nfsd4_run_cb(struct nfsd4_callback *cb);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
index 82f89070594c..90967466a1e5 100644
--- a/fs/nfsd/trace.c
+++ b/fs/nfsd/trace.c
@@ -1,5 +1,3 @@
-#include "state.h"
-
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c668520c344b..3287041905da 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -8,6 +8,49 @@
#define _NFSD_TRACE_H
#include <linux/tracepoint.h>
+#include "nfsfh.h"
+
+/*
+ * Event class shared by the NFSD I/O tracepoints.  Each event records the
+ * request's xid, a shallow copy of the file handle, and the offset/len
+ * values exactly as passed in by the caller; the handle is reduced to a
+ * hash only when the event is printed.
+ */
+DECLARE_EVENT_CLASS(nfsd_io_class,
+	TP_PROTO(struct svc_rqst *rqstp,
+		 struct svc_fh *fhp,
+		 loff_t offset,
+		 int len),
+	TP_ARGS(rqstp, fhp, offset, len),
+	TP_STRUCT__entry(
+		__field(__be32, xid)
+		__field_struct(struct knfsd_fh, fh)
+		__field(loff_t, offset)
+		__field(int, len)
+	),
+	TP_fast_assign(
+		/* Each assignment is its own statement (was ',' by typo). */
+		__entry->xid = rqstp->rq_xid;
+		fh_copy_shallow(&__entry->fh, &fhp->fh_handle);
+		__entry->offset = offset;
+		__entry->len = len;
+	),
+	TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d",
+		  __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh),
+		  __entry->offset, __entry->len)
+)
+
+/*
+ * Define one tracepoint of class nfsd_io_class.  Every event created with
+ * this macro takes the same (rqstp, fhp, offset, len) arguments as the class.
+ */
+#define DEFINE_NFSD_IO_EVENT(name) \
+DEFINE_EVENT(nfsd_io_class, name, \
+ TP_PROTO(struct svc_rqst *rqstp, \
+ struct svc_fh *fhp, \
+ loff_t offset, \
+ int len), \
+ TP_ARGS(rqstp, fhp, offset, len))
+
+/* One event per stage: start, opened, io_done, done -- for reads, then writes. */
+DEFINE_NFSD_IO_EVENT(read_start);
+DEFINE_NFSD_IO_EVENT(read_opened);
+DEFINE_NFSD_IO_EVENT(read_io_done);
+DEFINE_NFSD_IO_EVENT(read_done);
+DEFINE_NFSD_IO_EVENT(write_start);
+DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_io_done);
+DEFINE_NFSD_IO_EVENT(write_done);
+
+#include "state.h"
DECLARE_EVENT_CLASS(nfsd_stateid_class,
TP_PROTO(stateid_t *stp),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c04979e7b3..d40010e4f1a9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -36,12 +36,14 @@
#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
+#include "../internal.h"
#include "acl.h"
#include "idmap.h"
#endif /* CONFIG_NFSD_V4 */
#include "nfsd.h"
#include "vfs.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
@@ -217,10 +219,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
- /*
- * check if we have crossed a mount point ...
- */
if (nfsd_mountpoint(dentry, exp)) {
+ /*
+ * We don't need the i_mutex after all. It's
+ * still possible we could open this (regular
+ * files can be mountpoints too), but the
+ * i_mutex is just there to prevent renames of
+ * something that we might be about to delegate,
+ * and a mountpoint won't be renamed:
+ */
+ fh_unlock(fhp);
if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
dput(dentry);
goto out_nfserr;
@@ -485,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
- mutex_lock(&d_inode(dentry)->i_mutex);
+ inode_lock(d_inode(dentry));
host_error = security_inode_setsecctx(dentry, label->data, label->len);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
return nfserrno(host_error);
}
#else
@@ -498,6 +506,13 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif
+__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
+		u64 dst_pos, u64 count)
+{
+	int host_err;
+
+	/* Hand the whole request to the VFS, then convert its result with nfserrno(). */
+	host_err = vfs_clone_file_range(src, src_pos, dst, dst_pos, count);
+	return nfserrno(host_err);
+}
+
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, loff_t len,
int flags)
@@ -855,7 +870,7 @@ __be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
oldfs = get_fs();
set_fs(KERNEL_DS);
- host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+ host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset, 0);
set_fs(oldfs);
return nfsd_finish_read(file, count, host_err);
}
@@ -942,7 +957,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
- host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos);
+ host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos, 0);
set_fs(oldfs);
if (host_err < 0)
goto out_nfserr;
@@ -983,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct raparms *ra;
__be32 err;
+ trace_read_start(rqstp, fhp, offset, vlen);
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (err)
return err;
ra = nfsd_init_raparms(file);
+
+ trace_read_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+ trace_read_io_done(rqstp, fhp, offset, vlen);
+
if (ra)
nfsd_put_raparams(file, ra);
fput(file);
+ trace_read_done(rqstp, fhp, offset, vlen);
+
return err;
}
@@ -1008,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
{
__be32 err = 0;
+ trace_write_start(rqstp, fhp, offset, vlen);
+
if (file) {
err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
} else {
err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
if (err)
goto out;
+ trace_write_opened(rqstp, fhp, offset, vlen);
if (cnt)
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
cnt, stablep);
+ trace_write_io_done(rqstp, fhp, offset, vlen);
fput(file);
}
out:
+ trace_write_done(rqstp, fhp, offset, vlen);
return err;
}
@@ -1631,7 +1660,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
/* cannot use fh_lock as we need deadlock protective ordering
* so do it by hand */
trap = lock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 1;
+ ffhp->fh_locked = tfhp->fh_locked = true;
fill_pre_wcc(ffhp);
fill_pre_wcc(tfhp);
@@ -1681,7 +1710,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
fill_post_wcc(ffhp);
fill_post_wcc(tfhp);
unlock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 0;
+ ffhp->fh_locked = tfhp->fh_locked = false;
fh_drop_write(ffhp);
out:
@@ -1809,7 +1838,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
offset = *offsetp;
while (1) {
- struct inode *dir_inode = file_inode(file);
unsigned int reclen;
cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1856,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
if (!size)
break;
- /*
- * Various filldir functions may end up calling back into
- * lookup_one_len() and the file system's ->lookup() method.
- * These expect i_mutex to be held, as it would within readdir.
- */
- host_err = mutex_lock_killable(&dir_inode->i_mutex);
- if (host_err)
- break;
-
de = (struct buffered_dirent *)buf.dirent;
while (size > 0) {
offset = de->offset;
@@ -1853,7 +1872,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
size -= reclen;
de = (struct buffered_dirent *)((char *)de + reclen);
}
- mutex_unlock(&dir_inode->i_mutex);
if (size > 0) /* We bailed out early */
break;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fee2451ae248..2d573ec057f8 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -56,6 +56,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
struct xdr_netobj *);
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
struct file *, loff_t, loff_t, int);
+__be32 nfsd4_clone_file_range(struct file *, u64, struct file *,
+ u64, u64);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
@@ -112,14 +114,14 @@ static inline int fh_want_write(struct svc_fh *fh)
int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
if (!ret)
- fh->fh_want_write = 1;
+ fh->fh_want_write = true;
return ret;
}
static inline void fh_drop_write(struct svc_fh *fh)
{
if (fh->fh_want_write) {
- fh->fh_want_write = 0;
+ fh->fh_want_write = false;
mnt_drop_write(fh->fh_export->ex_path.mnt);
}
}
@@ -137,4 +139,23 @@ static inline int nfsd_create_is_exclusive(int createmode)
|| createmode == NFS4_CREATE_EXCLUSIVE4_1;
}
+static inline bool nfsd_eof_on_read(long requested, long read,
+				    loff_t offset, loff_t size)
+{
+	/*
+	 * Two conditions report eof:
+	 *
+	 *  - a short read (fewer bytes than requested), which we assume
+	 *    means end of file; or
+	 *  - a full-length read that ends at or beyond @size, since the
+	 *    spec still requires eof to be set in that case.
+	 *
+	 * The size comparison is not atomic with the read: the file size
+	 * may have changed since.  The only client problem seen from that
+	 * was a read returning a count of 0 without eof, and the
+	 * short-read test already covers it.
+	 */
+	return requested > read || offset + read >= size;
+}
+
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9f991007a578..d9554813e58a 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -491,6 +491,15 @@ struct nfsd4_fallocate {
u64 falloc_length;
};
+/*
+ * Arguments for the "clone" member of the nfsd4_op union (NFSv4.2 group).
+ * NOTE(review): the position/count fields presumably feed
+ * nfsd4_clone_file_range() unchanged -- confirm against the decoder and
+ * the operation handler.
+ */
+struct nfsd4_clone {
+ /* request */
+ stateid_t cl_src_stateid;
+ stateid_t cl_dst_stateid;
+ u64 cl_src_pos;
+ u64 cl_dst_pos;
+ u64 cl_count;
+};
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -555,6 +564,7 @@ struct nfsd4_op {
/* NFSv4.2 */
struct nfsd4_fallocate allocate;
struct nfsd4_fallocate deallocate;
+ struct nfsd4_clone clone;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
@@ -632,7 +642,7 @@ static inline void
set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
{
BUG_ON(!fhp->fh_pre_saved);
- cinfo->atomic = fhp->fh_post_saved;
+ cinfo->atomic = (u32)fhp->fh_post_saved;
cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
cinfo->before_change = fhp->fh_pre_change;
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 8df0f3b7839b..2ccbf5531554 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -133,38 +133,38 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
/**
* nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
*/
static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
- const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock)
{
unsigned long nfree;
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ spin_lock(lock);
nfree = le32_to_cpu(desc->pg_nfrees);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ spin_unlock(lock);
return nfree;
}
/**
* nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
* @n: delta to be added
*/
-static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
- unsigned long group,
- struct nilfs_palloc_group_desc *desc,
- u32 n)
+static u32
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock, u32 n)
{
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ u32 nfree;
+
+ spin_lock(lock);
le32_add_cpu(&desc->pg_nfrees, n);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ nfree = le32_to_cpu(desc->pg_nfrees);
+ spin_unlock(lock);
+ return nfree;
}
/**
@@ -240,6 +240,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
}
/**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ *
+ * If @prev currently holds a buffer head for @blkoff, that reference is
+ * released and the slot is cleared while @lock is held.  The lock is
+ * dropped before the block is passed to nilfs_mdt_delete_block().
+ *
+ * Return: the value returned by nilfs_mdt_delete_block().
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+ struct nilfs_bh_assoc *prev,
+ spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (prev->bh && blkoff == prev->blkoff) {
+ brelse(prev->bh);
+ prev->bh = NULL;
+ }
+ spin_unlock(lock);
+ return nilfs_mdt_delete_block(inode, blkoff);
+}
+
+/**
* nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -278,6 +298,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
}
/**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ *
+ * Looks up the bitmap block offset of @group and deletes that block,
+ * passing the allocator cache's prev_bitmap slot and lock along.
+ *
+ * Return: the value returned by nilfs_palloc_delete_block().
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+					    unsigned long group)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+	unsigned long blkoff = nilfs_palloc_bitmap_blkoff(inode, group);
+
+	return nilfs_palloc_delete_block(inode, blkoff, &cache->prev_bitmap,
+					 &cache->lock);
+}
+
+/**
* nilfs_palloc_get_entry_block - get buffer head of an entry block
* @inode: inode of metadata file using this allocator
* @nr: serial number of the entry (e.g. inode number)
@@ -296,6 +332,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
}
/**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ *
+ * Looks up the offset of the block holding entry @nr and deletes that
+ * block, passing the allocator cache's prev_entry slot and lock along.
+ *
+ * Return: the value returned by nilfs_palloc_delete_block().
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+	struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+	unsigned long blkoff = nilfs_palloc_entry_blkoff(inode, nr);
+
+	return nilfs_palloc_delete_block(inode, blkoff, &cache->prev_entry,
+					 &cache->lock);
+}
+
+/**
* nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -332,51 +382,40 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
/**
* nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
* @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
* @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
*/
-static int nilfs_palloc_find_available_slot(struct inode *inode,
- unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned char *bitmap,
- int bsize)
-{
- int curr, pos, end, i;
-
- if (target > 0) {
- end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, target);
- if (pos < end &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
- return pos;
- } else
- end = 0;
-
- for (i = 0, curr = end;
- i < bsize;
- i += BITS_PER_LONG, curr += BITS_PER_LONG) {
- /* wrap around */
- if (curr >= bsize)
- curr = 0;
- while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
- != ~0UL) {
- end = curr + BITS_PER_LONG;
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, curr);
- if ((pos < end) &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos,
- bitmap))
+ unsigned bsize,
+ spinlock_t *lock)
+{
+ int pos, end = bsize;
+
+ if (likely(target < bsize)) {
+ pos = target;
+ do {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
return pos;
- }
+ } while (++pos < end);
+
+ end = target;
+ }
+
+ /* wrap around */
+ for (pos = 0; pos < end; pos++) {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+ return pos;
}
+
return -ENOSPC;
}
@@ -475,15 +514,15 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, maxgroup, ngroups;
unsigned long group_offset, maxgroup_offset;
- unsigned long n, entries_per_group, groups_per_desc_block;
+ unsigned long n, entries_per_group;
unsigned long i, j;
+ spinlock_t *lock;
int pos, ret;
ngroups = nilfs_palloc_groups_count(inode);
maxgroup = ngroups - 1;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
entries_per_group = nilfs_palloc_entries_per_group(inode);
- groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
for (i = 0; i < ngroups; i += n) {
if (group >= ngroups) {
@@ -501,8 +540,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
maxgroup);
for (j = 0; j < n; j++, desc++, group++) {
- if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
- > 0) {
+ lock = nilfs_mdt_bgl_lock(inode, group);
+ if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
ret = nilfs_palloc_get_bitmap_block(
inode, group, 1, &bitmap_bh);
if (ret < 0)
@@ -510,12 +549,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- inode, group, group_offset, bitmap,
- entries_per_group);
+ bitmap, group_offset,
+ entries_per_group, lock);
if (pos >= 0) {
/* found a free entry */
nilfs_palloc_group_desc_add_entries(
- inode, group, desc, -1);
+ desc, lock, -1);
req->pr_entry_nr =
entries_per_group * group + pos;
kunmap(desc_bh->b_page);
@@ -573,6 +612,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
unsigned long group, group_offset;
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -580,13 +620,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+ lock = nilfs_mdt_bgl_lock(inode, group);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -611,6 +653,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned char *bitmap;
unsigned long group, group_offset;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -618,12 +661,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -680,22 +726,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
}
/**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
- __u64 first, last;
-
- first = group * nilfs_palloc_entries_per_group(inode);
- last = first + nilfs_palloc_entries_per_group(inode) - 1;
- return (nr >= first) && (nr <= last);
-}
-
-/**
* nilfs_palloc_freev - deallocate a set of persistent objects
* @inode: inode of metadata file using this allocator
* @entry_nrs: array of entry numbers to be deallocated
@@ -708,9 +738,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
- int i, j, n, ret;
+ __u64 group_min_nr, last_nrs[8];
+ const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+ const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned entry_start, end, pos;
+ spinlock_t *lock;
+ int i, j, k, ret;
+ u32 nfree;
for (i = 0; i < nitems; i = j) {
+ int change_group = false;
+ int nempties = 0, n = 0;
+
group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
if (ret < 0)
@@ -721,38 +760,89 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
brelse(desc_bh);
return ret;
}
- desc_kaddr = kmap(desc_bh->b_page);
- desc = nilfs_palloc_block_get_group_desc(
- inode, group, desc_bh, desc_kaddr);
+
+ /* Get the first entry number of the group */
+ group_min_nr = (__u64)group * epg;
+
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
- for (j = i, n = 0;
- (j < nitems) && nilfs_palloc_group_is_in(inode, group,
- entry_nrs[j]);
- j++) {
- nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
- if (!nilfs_clear_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap)) {
- printk(KERN_WARNING
- "%s: entry number %llu already freed\n",
- __func__,
- (unsigned long long)entry_nrs[j]);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ j = i;
+ entry_start = rounddown(group_offset, epb);
+ do {
+ if (!nilfs_clear_bit_atomic(lock, group_offset,
+ bitmap)) {
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)entry_nrs[j],
+ (unsigned long)inode->i_ino);
} else {
n++;
}
- }
- nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+ j++;
+ if (j >= nitems || entry_nrs[j] < group_min_nr ||
+ entry_nrs[j] >= group_min_nr + epg) {
+ change_group = true;
+ } else {
+ group_offset = entry_nrs[j] - group_min_nr;
+ if (group_offset >= entry_start &&
+ group_offset < entry_start + epb) {
+ /* This entry is in the same block */
+ continue;
+ }
+ }
+
+ /* Test if the entry block is empty or not */
+ end = entry_start + epb;
+ pos = nilfs_find_next_bit(bitmap, end, entry_start);
+ if (pos >= end) {
+ last_nrs[nempties++] = entry_nrs[j - 1];
+ if (nempties >= ARRAY_SIZE(last_nrs))
+ break;
+ }
+
+ if (change_group)
+ break;
+
+ /* Go on to the next entry block */
+ entry_start = rounddown(group_offset, epb);
+ } while (true);
kunmap(bitmap_bh->b_page);
- kunmap(desc_bh->b_page);
+ mark_buffer_dirty(bitmap_bh);
+ brelse(bitmap_bh);
+ for (k = 0; k < nempties; k++) {
+ ret = nilfs_palloc_delete_entry_block(inode,
+ last_nrs[k]);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+ (unsigned long long)last_nrs[k],
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
+
+ desc_kaddr = kmap_atomic(desc_bh->b_page);
+ desc = nilfs_palloc_block_get_group_desc(
+ inode, group, desc_bh, desc_kaddr);
+ nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
+ kunmap_atomic(desc_kaddr);
mark_buffer_dirty(desc_bh);
- mark_buffer_dirty(bitmap_bh);
nilfs_mdt_mark_dirty(inode);
-
- brelse(bitmap_bh);
brelse(desc_bh);
+
+ if (nfree == nilfs_palloc_entries_per_group(inode)) {
+ ret = nilfs_palloc_delete_bitmap_block(inode, group);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+ group,
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
}
return 0;
}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4bd6451b5703..6e6f49aa53df 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
#define nilfs_set_bit_atomic ext2_set_bit_atomic
#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
#define nilfs_find_next_zero_bit find_next_zero_bit_le
+#define nilfs_find_next_bit find_next_bit_le
/**
* struct nilfs_bh_assoc - block offset and buffer head association
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 27f75bcbeb30..a9fb3636c142 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -458,7 +458,7 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
struct buffer_head *pbh;
__u64 key;
- key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
+ key = page_index(bh->b_page) << (PAGE_SHIFT -
bmap->b_inode->i_blkbits);
for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
key++;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index a35ae35e6932..e0c9daf9aa22 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -62,7 +62,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
set_buffer_uptodate(bh);
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
return bh;
}
@@ -128,7 +128,7 @@ found:
out_locked:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -146,7 +146,7 @@ void nilfs_btnode_delete(struct buffer_head *bh)
pgoff_t index = page_index(page);
int still_dirty;
- page_cache_get(page);
+ get_page(page);
lock_page(page);
wait_on_page_writeback(page);
@@ -154,7 +154,7 @@ void nilfs_btnode_delete(struct buffer_head *bh)
still_dirty = PageDirty(page);
mapping = page->mapping;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (!still_dirty && mapping)
invalidate_inode_pages2_range(mapping, index, index);
@@ -181,7 +181,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
obh = ctxt->bh;
ctxt->newbh = NULL;
- if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
+ if (inode->i_blkbits == PAGE_SHIFT) {
lock_page(obh->b_page);
/*
* We cannot call radix_tree_preload for the kernels older
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 919fd5bb14a8..3a3821b00486 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
int level, __u64 *keyp, __u64 *ptrp)
{
struct nilfs_btree_node *node, *right;
- __u64 newkey;
- __u64 newptr;
int nchildren, n, move, ncblk;
node = nilfs_btree_get_nonroot_node(path, level);
@@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
mark_buffer_dirty(path[level].bp_sib_bh);
- newkey = nilfs_btree_node_get_key(right, 0);
- newptr = path[level].bp_newreq.bpr_ptr;
-
if (move) {
path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(right, path[level].bp_index,
@@ -1856,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
__u64 key, __u64 ptr,
const __u64 *keys, const __u64 *ptrs, int n)
{
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
struct nilfs_bmap_stats stats;
int ret;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 0d5fada91191..7dc23f100e57 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
{
struct nilfs_dat_entry *entry;
- __u64 start;
sector_t blocknr;
void *kaddr;
int ret;
@@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
- start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
kunmap_atomic(kaddr);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 6b8b92b19cec..e08f064e4bd7 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -58,7 +58,7 @@ static inline unsigned nilfs_chunk_size(struct inode *inode)
static inline void nilfs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -69,9 +69,9 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
{
unsigned last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
@@ -109,12 +109,12 @@ static void nilfs_check_page(struct page *page)
unsigned chunk_size = nilfs_chunk_size(dir);
char *kaddr = page_address(page);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
struct nilfs_dir_entry *p;
char *error;
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -161,7 +161,7 @@ Espan:
bad_entry:
nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
(unsigned long) le64_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
@@ -170,7 +170,7 @@ Eend:
nilfs_error(sb, "nilfs_check_page",
"entry in directory #%lu spans the page boundary"
"offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
(unsigned long) le64_to_cpu(p->inode));
fail:
SetPageChecked(page);
@@ -256,8 +256,8 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
@@ -272,7 +272,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
if (IS_ERR(page)) {
nilfs_error(sb, __func__, "bad page in #%lu",
inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
kaddr = page_address(page);
@@ -361,7 +361,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
if (++n >= npages)
n = 0;
/* next page is past the blocks we've got */
- if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
nilfs_error(dir->i_sb, __func__,
"dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
@@ -401,7 +401,7 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
if (de) {
res = le64_to_cpu(de->inode);
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
return res;
}
@@ -460,7 +460,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + nilfs_last_byte(dir, n);
de = (struct nilfs_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -603,7 +603,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
kunmap_atomic(kaddr);
nilfs_commit_chunk(page, mapping, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 54575e3cc1a2..088ba001c6ef 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -109,7 +109,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
file_update_time(vma->vm_file);
- ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+ ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
nilfs_transaction_abort(inode->i_sb);
goto out;
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 748ca238915a..0224b7826ace 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -115,7 +115,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
failed:
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4a73d6dffabf..534631358b13 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -249,7 +249,7 @@ static int nilfs_set_page_dirty(struct page *page)
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
} else if (ret) {
- unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
nilfs_set_file_dirty(inode, nr_dirty);
}
@@ -291,7 +291,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned start = pos & (PAGE_SIZE - 1);
unsigned nr_dirty;
int err;
@@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
goto failed;
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
@@ -510,6 +510,7 @@ static int __nilfs_read_inode(struct super_block *sb,
inode->i_mapping->a_ops = &nilfs_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
} else {
inode->i_op = &nilfs_special_inode_operations;
@@ -522,7 +523,7 @@ static int __nilfs_read_inode(struct super_block *sb,
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
return 0;
failed_unmap:
@@ -1002,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
isize = i_size_read(inode);
@@ -1112,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret == 1)
ret = 0;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index aba43811d6ef..e8fe24882b5b 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
flags = nilfs_mask_flags(inode->i_mode, flags);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = NILFS_I(inode)->i_flags;
@@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
nilfs_mark_inode_dirty(inode);
ret = nilfs_transaction_commit(inode->i_sb);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index dee34d990281..f6982b9153d5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -33,6 +33,7 @@
#include "page.h"
#include "mdt.h"
+#include <trace/events/nilfs2.h>
#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
@@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
+
+ trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
return 0;
}
@@ -106,7 +110,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
failed_bh:
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
brelse(bh);
failed_unlock:
@@ -158,13 +162,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
get_bh(bh);
submit_bh(mode, bh);
ret = 0;
+
+ trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
out:
get_bh(bh);
*out_bh = bh;
failed_bh:
unlock_page(bh->b_page);
- page_cache_release(bh->b_page);
+ put_page(bh->b_page);
brelse(bh);
failed:
return ret;
@@ -357,7 +363,7 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
{
pgoff_t index = (pgoff_t)block >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
struct page *page;
unsigned long first_block;
int ret = 0;
@@ -370,7 +376,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
wait_on_page_writeback(page);
first_block = (unsigned long)index <<
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ (PAGE_SHIFT - inode->i_blkbits);
if (page_has_buffers(page)) {
struct buffer_head *bh;
@@ -379,7 +385,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
}
still_dirty = PageDirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (still_dirty ||
invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
@@ -572,7 +578,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return 0;
}
@@ -591,7 +597,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
bh_frozen = nilfs_page_get_nth_block(page, n);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return bh_frozen;
}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index fe529a87a208..03246cac3338 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
}
/* Default GFP flags using highmem */
-#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 37dd6b05b1b5..151bc19d47c0 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
struct nilfs_transaction_info ti;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
return err;
@@ -164,6 +161,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
/* slow symlink */
inode->i_op = &nilfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
err = page_symlink(inode, symname, l);
if (err)
@@ -433,11 +431,11 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
nilfs_transaction_abort(old_dir->i_sb);
return err;
@@ -571,8 +569,7 @@ const struct inode_operations nilfs_special_inode_operations = {
const struct inode_operations nilfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.permission = nilfs_permission,
};
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 45d650addd56..489391561cda 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -50,7 +50,7 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << blkbits, b_state);
- first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
+ first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
bh = nilfs_page_get_nth_block(page, block - first_block);
touch_buffer(bh);
@@ -64,7 +64,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
unsigned long b_state)
{
int blkbits = inode->i_blkbits;
- pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
+ pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits);
struct page *page;
struct buffer_head *bh;
@@ -75,7 +75,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
if (unlikely(!bh)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return NULL;
}
return bh;
@@ -180,7 +180,7 @@ void nilfs_page_bug(struct page *page)
printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
"mapping=%p ino=%lu\n",
- page, atomic_read(&page->_count),
+ page, page_ref_count(page),
(unsigned long long)page->index, page->flags, m, ino);
if (page_has_buffers(page)) {
@@ -288,7 +288,7 @@ repeat:
__set_page_dirty_nobuffers(dpage);
unlock_page(dpage);
- page_cache_release(dpage);
+ put_page(dpage);
unlock_page(page);
}
pagevec_release(&pvec);
@@ -333,7 +333,7 @@ repeat:
WARN_ON(PageDirty(dpage));
nilfs_copy_page(dpage, page, 0);
unlock_page(dpage);
- page_cache_release(dpage);
+ put_page(dpage);
} else {
struct page *page2;
@@ -350,7 +350,7 @@ repeat:
if (unlikely(err < 0)) {
WARN_ON(err == -EEXIST);
page->mapping = NULL;
- page_cache_release(page); /* for cache */
+ put_page(page); /* for cache */
} else {
page->mapping = dmap;
dmap->nrpages++;
@@ -523,8 +523,8 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
if (inode->i_mapping->nrpages == 0)
return 0;
- index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
+ nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
pagevec_init(&pvec, 0);
@@ -537,7 +537,7 @@ repeat:
if (length > 0 && pvec.pages[0]->index > index)
goto out;
- b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits);
i = 0;
do {
page = pvec.pages[i];
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ff00a0b7acb9..5afa77fadc11 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -544,14 +544,14 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
blocksize, page, NULL);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
(*nr_salvaged_blocks)++;
goto next;
failed_page:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
failed_inode:
printk(KERN_WARNING
@@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start;
sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
unsigned long nsalvaged_blocks = 0;
@@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start, pseg_end, sr_pseg_start = 0;
sector_t seg_start, seg_end; /* range of full segment (block number) */
sector_t b, end;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c6abbad9b8e3..4317f72568e6 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -77,6 +77,36 @@ enum {
NILFS_ST_DONE,
};
+#define CREATE_TRACE_POINTS
+#include <trace/events/nilfs2.h>
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are
+ * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of
+ * the variable must use them because transition of stage count must involve
+ * trace events (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't
+ * produce tracepoint events. It is provided just for making the intention
+ * clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+ sci->sc_stage.scnt++;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt)
+{
+ sci->sc_stage.scnt = next_scnt;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+ return sci->sc_stage.scnt;
+}
+
/* State flags of collection */
#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
@@ -184,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb,
{
struct the_nilfs *nilfs;
int ret = nilfs_prepare_segment_lock(ti);
+ struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
return ret;
- if (ret > 0)
+ if (ret > 0) {
+ trace_ti = current->journal_info;
+
+ trace_nilfs2_transaction_transition(sb, trace_ti,
+ trace_ti->ti_count, trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
+ }
sb_start_intwrite(sb);
@@ -199,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb,
ret = -ENOSPC;
goto failed;
}
+
+ trace_ti = current->journal_info;
+ trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+ trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
failed:
@@ -231,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb)
ti->ti_flags |= NILFS_TI_COMMIT;
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
return 0;
}
if (nilfs->ns_writer) {
@@ -242,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb)
nilfs_segctor_do_flush(sci, 0);
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_SYNC)
@@ -260,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb)
BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
return;
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
@@ -309,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
current->journal_info = ti;
for (;;) {
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK);
+
down_write(&nilfs->ns_segctor_sem);
if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
break;
@@ -320,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK);
}
static void nilfs_transaction_unlock(struct super_block *sb)
@@ -332,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -1062,7 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
size_t ndone;
int err = 0;
- switch (sci->sc_stage.scnt) {
+ switch (nilfs_sc_cstage_get(sci)) {
case NILFS_ST_INIT:
/* Pre-processes */
sci->sc_stage.flags = 0;
@@ -1071,7 +1132,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_nblk_inc = 0;
sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
if (mode == SC_LSEG_DSYNC) {
- sci->sc_stage.scnt = NILFS_ST_DSYNC;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
goto dsync_mode;
}
}
@@ -1079,10 +1140,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_stage.dirty_file_ptr = NULL;
sci->sc_stage.gc_inode_ptr = NULL;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DAT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
goto dat_stage;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_GC:
if (nilfs_doing_gc()) {
head = &sci->sc_gc_inodes;
@@ -1103,7 +1164,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.gc_inode_ptr = NULL;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_FILE:
head = &sci->sc_dirty_files;
ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
@@ -1125,10 +1186,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.dirty_file_ptr = NULL;
if (mode == SC_FLUSH_FILE) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
/* Fall through */
case NILFS_ST_IFILE:
@@ -1136,7 +1197,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
/* Creating a checkpoint */
err = nilfs_segctor_create_checkpoint(sci);
if (unlikely(err))
@@ -1147,7 +1208,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SUFILE:
err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
sci->sc_nfreesegs, &ndone);
@@ -1163,7 +1224,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_DAT:
dat_stage:
err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
@@ -1171,10 +1232,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SR:
if (mode == SC_LSEG_SR) {
/* Appending a super root */
@@ -1184,7 +1245,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
/* End of a logical segment */
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DSYNC:
dsync_mode:
@@ -1197,7 +1258,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DONE:
return 0;
@@ -1442,7 +1503,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
goto failed;
/* The current segment is filled up */
- if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+ if (mode != SC_LSEG_SR ||
+ nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE)
break;
nilfs_clear_logs(&sci->sc_segbufs);
@@ -1946,7 +2008,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int err;
- sci->sc_stage.scnt = NILFS_ST_INIT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
sci->sc_cno = nilfs->ns_cno;
err = nilfs_segctor_collect_dirty_files(sci, nilfs);
@@ -1974,7 +2036,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
goto failed;
/* Avoid empty segment */
- if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE &&
nilfs_segbuf_empty(sci->sc_curseg)) {
nilfs_segctor_abort_construction(sci, nilfs, 1);
goto out;
@@ -1988,7 +2050,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
nilfs_segctor_fill_in_file_bmap(sci);
if (mode == SC_LSEG_SR &&
- sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+ nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
err = nilfs_segctor_fill_in_checkpoint(sci);
if (unlikely(err))
goto failed_to_write;
@@ -2007,8 +2069,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
goto failed_to_write;
- if (sci->sc_stage.scnt == NILFS_ST_DONE ||
- nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
+ nilfs->ns_blocksize_bits != PAGE_SHIFT) {
/*
* At this point, we avoid double buffering
* for blocksize < pagesize because page dirty
@@ -2020,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (err)
goto failed_to_write;
}
- } while (sci->sc_stage.scnt != NILFS_ST_DONE);
+ } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);
out:
nilfs_segctor_drop_written_files(sci, nilfs);
@@ -2430,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
{
int mode = 0;
- int err;
spin_lock(&sci->sc_state_lock);
mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
@@ -2438,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
spin_unlock(&sci->sc_state_lock);
if (mode) {
- err = nilfs_segctor_do_construct(sci, mode);
+ nilfs_segctor_do_construct(sci, mode);
spin_lock(&sci->sc_state_lock);
sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index a48d6de1e02c..0408b9b2814b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -67,7 +67,8 @@ struct nilfs_recovery_info {
/**
* struct nilfs_cstage - Context of collection stage
- * @scnt: Stage count
+ * @scnt: Stage count, must be accessed via wrappers:
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get()
* @flags: State flags
* @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
* @gc_inode_ptr: Pointer on the list of gc-inodes
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2a869c35c362..52821ffc11f4 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -30,6 +30,8 @@
#include "mdt.h"
#include "sufile.h"
+#include <trace/events/nilfs2.h>
+
/**
* struct nilfs_sufile_info - on-memory private data of sufile
* @mi: on-memory private data of metadata file
@@ -317,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
size_t susz = NILFS_MDT(sufile)->mi_entry_size;
__u64 segnum, maxsegnum, last_alloc;
void *kaddr;
- unsigned long nsegments, ncleansegs, nsus, cnt;
+ unsigned long nsegments, nsus, cnt;
int ret, j;
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -327,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
goto out_sem;
kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
- ncleansegs = le64_to_cpu(header->sh_ncleansegs);
last_alloc = le64_to_cpu(header->sh_last_alloc);
kunmap_atomic(kaddr);
@@ -358,6 +359,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
break; /* never happens */
}
}
+ trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
&su_bh);
if (ret < 0)
@@ -388,6 +390,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
+
+ trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
goto out_header;
}
@@ -490,6 +495,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
NILFS_SUI(sufile)->ncleansegs++;
nilfs_mdt_mark_dirty(sufile);
+
+ trace_nilfs2_segment_usage_freed(sufile, segnum);
}
/**
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585bfeb01..7f5d3d9f1c37 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
struct nilfs_super_block *nsbp;
sector_t blocknr, newblocknr;
unsigned long offset;
- int sb2i = -1; /* array index of the secondary superblock */
+ int sb2i; /* array index of the secondary superblock */
int ret = 0;
/* nilfs->ns_sem must be locked by the caller. */
@@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
sb2i = 0;
blocknr = nilfs->ns_sbh[0]->b_blocknr;
+ } else {
+ sb2i = -1;
+ blocknr = 0;
}
if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
goto out; /* super block location is unchanged */
@@ -1313,13 +1316,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (!s->s_root) {
- char b[BDEVNAME_SIZE];
-
- s_new = true;
+ s_new = true;
/* New superblock instance created */
s->s_mode = mode;
- strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
sb_set_blocksize(s, block_size(sd.bdev));
err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
@@ -1405,21 +1406,18 @@ static void nilfs_destroy_cachep(void)
*/
rcu_barrier();
- if (nilfs_inode_cachep)
- kmem_cache_destroy(nilfs_inode_cachep);
- if (nilfs_transaction_cachep)
- kmem_cache_destroy(nilfs_transaction_cachep);
- if (nilfs_segbuf_cachep)
- kmem_cache_destroy(nilfs_segbuf_cachep);
- if (nilfs_btree_path_cache)
- kmem_cache_destroy(nilfs_btree_path_cache);
+ kmem_cache_destroy(nilfs_inode_cachep);
+ kmem_cache_destroy(nilfs_transaction_cachep);
+ kmem_cache_destroy(nilfs_segbuf_cachep);
+ kmem_cache_destroy(nilfs_btree_path_cache);
}
static int __init nilfs_init_cachep(void)
{
nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
sizeof(struct nilfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+ SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ nilfs_inode_init_once);
if (!nilfs_inode_cachep)
goto fail;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e785fd954c30..741077deef3b 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
break;
}
spin_unlock(&next_i->i_lock);
- next_i = list_entry(next_i->i_sb_list.next,
- struct inode, i_sb_list);
+ next_i = list_next_entry(next_i, i_sb_list);
}
/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index fc0df4442f7b..7115c5d7d373 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,10 +91,14 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
+#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */
+
struct srcu_struct fsnotify_mark_srcu;
static DEFINE_SPINLOCK(destroy_lock);
static LIST_HEAD(destroy_list);
-static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
+
+static void fsnotify_mark_destroy(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
@@ -189,7 +193,8 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
/*
* Some groups like to know that marks are being freed. This is a
@@ -388,7 +393,8 @@ err:
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
- wake_up(&destroy_waitq);
+ queue_delayed_work(system_unbound_wq, &reaper_work,
+ FSNOTIFY_REAPER_DELAY);
return ret;
}
@@ -493,39 +499,20 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
mark->free_mark = free_mark;
}
-static int fsnotify_mark_destroy(void *ignored)
+static void fsnotify_mark_destroy(struct work_struct *work)
{
struct fsnotify_mark *mark, *next;
struct list_head private_destroy_list;
- for (;;) {
- spin_lock(&destroy_lock);
- /* exchange the list head */
- list_replace_init(&destroy_list, &private_destroy_list);
- spin_unlock(&destroy_lock);
-
- synchronize_srcu(&fsnotify_mark_srcu);
+ spin_lock(&destroy_lock);
+ /* exchange the list head */
+ list_replace_init(&destroy_list, &private_destroy_list);
+ spin_unlock(&destroy_lock);
- list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
- list_del_init(&mark->g_list);
- fsnotify_put_mark(mark);
- }
+ synchronize_srcu(&fsnotify_mark_srcu);
- wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
+ list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+ list_del_init(&mark->g_list);
+ fsnotify_put_mark(mark);
}
-
- return 0;
-}
-
-static int __init fsnotify_mark_init(void)
-{
- struct task_struct *thread;
-
- thread = kthread_run(fsnotify_mark_destroy, NULL,
- "fsnotify_mark");
- if (IS_ERR(thread))
- panic("unable to start fsnotify mark destruction thread.");
-
- return 0;
}
-device_initcall(fsnotify_mark_init);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7521e11db728..97768a1379f2 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -74,7 +74,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
- file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
+ file_ofs = ((s64)page->index << PAGE_SHIFT) +
bh_offset(bh);
read_lock_irqsave(&ni->size_lock, flags);
init_size = ni->initialized_size;
@@ -142,7 +142,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
u32 rec_size;
rec_size = ni->itype.index.block_size;
- recs = PAGE_CACHE_SIZE / rec_size;
+ recs = PAGE_SIZE / rec_size;
/* Should have been verified before we got here... */
BUG_ON(!recs);
local_irq_save(flags);
@@ -229,7 +229,7 @@ static int ntfs_read_block(struct page *page)
* fully truncated, truncate will throw it away as soon as we unlock
* it so no need to worry what we do with it.
*/
- iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+ iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
init_size = ni->initialized_size;
@@ -412,9 +412,9 @@ retry_readpage:
vi = page->mapping->host;
i_size = i_size_read(vi);
/* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT)) {
+ zero_user(page, 0, PAGE_SIZE);
ntfs_debug("Read outside i_size - truncated?");
goto done;
}
@@ -463,7 +463,7 @@ retry_readpage:
* ok to ignore the compressed flag here.
*/
if (unlikely(page->index > 0)) {
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
goto done;
}
if (!NInoAttr(ni))
@@ -509,7 +509,7 @@ retry_readpage:
le16_to_cpu(ctx->attr->data.resident.value_offset),
attr_len);
/* Zero the remainder of the page. */
- memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+ memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
flush_dcache_page(page);
kunmap_atomic(addr);
put_unm_err_out:
@@ -599,7 +599,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
/* NOTE: Different naming scheme to ntfs_read_block()! */
/* The first block in the page. */
- block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+ block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
read_lock_irqsave(&ni->size_lock, flags);
i_size = i_size_read(vi);
@@ -674,7 +674,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
// in the inode.
// Again, for each page do:
// __set_page_dirty_buffers();
- // page_cache_release()
+ // put_page()
// We don't need to wait on the writes.
// Update iblock.
}
@@ -925,7 +925,7 @@ static int ntfs_write_mst_block(struct page *page,
ntfs_volume *vol = ni->vol;
u8 *kaddr;
unsigned int rec_size = ni->itype.index.block_size;
- ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
+ ntfs_inode *locked_nis[PAGE_SIZE / rec_size];
struct buffer_head *bh, *head, *tbh, *rec_start_bh;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
runlist_element *rl;
@@ -949,7 +949,7 @@ static int ntfs_write_mst_block(struct page *page,
(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
bh_size = vol->sb->s_blocksize;
bh_size_bits = vol->sb->s_blocksize_bits;
- max_bhs = PAGE_CACHE_SIZE / bh_size;
+ max_bhs = PAGE_SIZE / bh_size;
BUG_ON(!max_bhs);
BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
@@ -961,13 +961,13 @@ static int ntfs_write_mst_block(struct page *page,
BUG_ON(!bh);
rec_size_bits = ni->itype.index.block_size_bits;
- BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
+ BUG_ON(!(PAGE_SIZE >> rec_size_bits));
bhs_per_rec = rec_size >> bh_size_bits;
BUG_ON(!bhs_per_rec);
/* The first block in the page. */
rec_block = block = (sector_t)page->index <<
- (PAGE_CACHE_SHIFT - bh_size_bits);
+ (PAGE_SHIFT - bh_size_bits);
/* The first out of bounds block for the data size. */
dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
@@ -1133,7 +1133,7 @@ lock_retry_remap:
unsigned long mft_no;
/* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+ mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
>> rec_size_bits;
/* Check whether to write this mft record. */
tni = NULL;
@@ -1249,7 +1249,7 @@ do_mirror:
continue;
ofs = bh_offset(tbh);
/* Get the mft record number. */
- mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+ mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
>> rec_size_bits;
if (mft_no < vol->mftmirr_size)
ntfs_sync_mft_mirror(vol, mft_no,
@@ -1300,7 +1300,7 @@ done:
* Set page error if there is only one ntfs record in the page.
* Otherwise we would loose per-record granularity.
*/
- if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
+ if (ni->itype.index.block_size == PAGE_SIZE)
SetPageError(page);
NVolSetErrors(vol);
}
@@ -1308,7 +1308,7 @@ done:
ntfs_debug("Page still contains one or more dirty ntfs "
"records. Redirtying the page starting at "
"record 0x%lx.", page->index <<
- (PAGE_CACHE_SHIFT - rec_size_bits));
+ (PAGE_SHIFT - rec_size_bits));
redirty_page_for_writepage(wbc, page);
unlock_page(page);
} else {
@@ -1365,13 +1365,13 @@ retry_writepage:
BUG_ON(!PageLocked(page));
i_size = i_size_read(vi);
/* Is the page fully outside i_size? (truncate in progress) */
- if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT)) {
+ if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT)) {
/*
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ block_invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
ntfs_debug("Write outside i_size - truncated?");
return 0;
@@ -1414,10 +1414,10 @@ retry_writepage:
/* NInoNonResident() == NInoIndexAllocPresent() */
if (NInoNonResident(ni)) {
/* We have to zero every time due to mmap-at-end-of-file. */
- if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
+ if (page->index >= (i_size >> PAGE_SHIFT)) {
/* The page straddles i_size. */
- unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
- zero_user_segment(page, ofs, PAGE_CACHE_SIZE);
+ unsigned int ofs = i_size & ~PAGE_MASK;
+ zero_user_segment(page, ofs, PAGE_SIZE);
}
/* Handle mst protected attributes. */
if (NInoMstProtected(ni))
@@ -1500,7 +1500,7 @@ retry_writepage:
le16_to_cpu(ctx->attr->data.resident.value_offset),
addr, attr_len);
/* Zero out of bounds area in the page cache page. */
- memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+ memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
kunmap_atomic(addr);
flush_dcache_page(page);
flush_dcache_mft_record_page(ctx->ntfs_ino);
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index caecc58f529c..820d6eabf60f 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -40,7 +40,7 @@
static inline void ntfs_unmap_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
/**
@@ -49,7 +49,7 @@ static inline void ntfs_unmap_page(struct page *page)
* @index: index into the page cache for @mapping of the page to map
*
* Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes.
+ * @index, where @index is in units of PAGE_SIZE, and not in bytes.
*
* If the page is not in memory it is loaded from disk first using the readpage
* method defined in the address space operations of @mapping and the page is
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 250ed5b20c8f..44a39a099b54 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -152,7 +152,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
old_ctx.base_ntfs_ino) {
put_this_page = old_ctx.ntfs_ino->page;
- page_cache_get(put_this_page);
+ get_page(put_this_page);
}
/*
* Reinitialize the search context so we can lookup the
@@ -275,7 +275,7 @@ retry_map:
* the pieces anyway.
*/
if (put_this_page)
- page_cache_release(put_this_page);
+ put_page(put_this_page);
}
return err;
}
@@ -1660,7 +1660,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
memcpy(kaddr, (u8*)a +
le16_to_cpu(a->data.resident.value_offset),
attr_size);
- memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size);
+ memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
kunmap_atomic(kaddr);
flush_dcache_page(page);
SetPageUptodate(page);
@@ -1748,7 +1748,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
if (page) {
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
ntfs_debug("Done.");
return 0;
@@ -1835,7 +1835,7 @@ rl_err_out:
ntfs_free(rl);
page_err_out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (err == -EINVAL)
err = -EIO;
@@ -2513,17 +2513,17 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
BUG_ON(NInoEncrypted(ni));
mapping = VFS_I(ni)->i_mapping;
/* Work out the starting index and page offset. */
- idx = ofs >> PAGE_CACHE_SHIFT;
- start_ofs = ofs & ~PAGE_CACHE_MASK;
+ idx = ofs >> PAGE_SHIFT;
+ start_ofs = ofs & ~PAGE_MASK;
/* Work out the ending index and page offset. */
end = ofs + cnt;
- end_ofs = end & ~PAGE_CACHE_MASK;
+ end_ofs = end & ~PAGE_MASK;
/* If the end is outside the inode size return -ESPIPE. */
if (unlikely(end > i_size_read(VFS_I(ni)))) {
ntfs_error(vol->sb, "Request exceeds end of attribute.");
return -ESPIPE;
}
- end >>= PAGE_CACHE_SHIFT;
+ end >>= PAGE_SHIFT;
/* If there is a first partial page, need to do it the slow way. */
if (start_ofs) {
page = read_mapping_page(mapping, idx, NULL);
@@ -2536,7 +2536,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
* If the last page is the same as the first page, need to
* limit the write to the end offset.
*/
- size = PAGE_CACHE_SIZE;
+ size = PAGE_SIZE;
if (idx == end)
size = end_ofs;
kaddr = kmap_atomic(page);
@@ -2544,7 +2544,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
flush_dcache_page(page);
kunmap_atomic(kaddr);
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
balance_dirty_pages_ratelimited(mapping);
cond_resched();
if (idx == end)
@@ -2561,7 +2561,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
return -ENOMEM;
}
kaddr = kmap_atomic(page);
- memset(kaddr, val, PAGE_CACHE_SIZE);
+ memset(kaddr, val, PAGE_SIZE);
flush_dcache_page(page);
kunmap_atomic(kaddr);
/*
@@ -2585,7 +2585,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
set_page_dirty(page);
/* Finally unlock and release the page. */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
balance_dirty_pages_ratelimited(mapping);
cond_resched();
}
@@ -2602,7 +2602,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
flush_dcache_page(page);
kunmap_atomic(kaddr);
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
balance_dirty_pages_ratelimited(mapping);
cond_resched();
}
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index 0809cf876098..ec130c588d2b 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -67,8 +67,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
* Calculate the indices for the pages containing the first and last
* bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
*/
- index = start_bit >> (3 + PAGE_CACHE_SHIFT);
- end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT);
+ index = start_bit >> (3 + PAGE_SHIFT);
+ end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
/* Get the page containing the first bit (@start_bit). */
mapping = vi->i_mapping;
@@ -82,7 +82,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
kaddr = page_address(page);
/* Set @pos to the position of the byte containing @start_bit. */
- pos = (start_bit >> 3) & ~PAGE_CACHE_MASK;
+ pos = (start_bit >> 3) & ~PAGE_MASK;
/* Calculate the position of @start_bit in the first byte. */
bit = start_bit & 7;
@@ -108,7 +108,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
* Depending on @value, modify all remaining whole bytes in the page up
* to @cnt.
*/
- len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos);
+ len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
memset(kaddr + pos, value ? 0xff : 0, len);
cnt -= len << 3;
@@ -132,7 +132,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
* Depending on @value, modify all remaining whole bytes in the
* page up to @cnt.
*/
- len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE);
+ len = min_t(s64, cnt >> 3, PAGE_SIZE);
memset(kaddr, value ? 0xff : 0, len);
cnt -= len << 3;
}
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index f82498c35e78..f2b5e746f49b 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -104,16 +104,12 @@ static void zero_partial_compressed_page(struct page *page,
unsigned int kp_ofs;
ntfs_debug("Zeroing page region outside initialized size.");
- if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) {
- /*
- * FIXME: Using clear_page() will become wrong when we get
- * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
- */
+ if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
clear_page(kp);
return;
}
- kp_ofs = initialized_size & ~PAGE_CACHE_MASK;
- memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs);
+ kp_ofs = initialized_size & ~PAGE_MASK;
+ memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
return;
}
@@ -123,7 +119,7 @@ static void zero_partial_compressed_page(struct page *page,
static inline void handle_bounds_compressed_page(struct page *page,
const loff_t i_size, const s64 initialized_size)
{
- if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) &&
+ if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
(initialized_size < i_size))
zero_partial_compressed_page(page, initialized_size);
return;
@@ -160,7 +156,7 @@ static inline void handle_bounds_compressed_page(struct page *page,
* @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
* completed during the decompression of the compression block (@cb_start).
*
- * Warning: This function *REQUIRES* PAGE_CACHE_SIZE >= 4096 or it will blow up
+ * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
* unpredicatbly! You have been warned!
*
* Note to hackers: This function may not sleep until it has finished accessing
@@ -241,7 +237,7 @@ return_error:
if (di == xpage)
*xpage_done = 1;
else
- page_cache_release(dp);
+ put_page(dp);
dest_pages[di] = NULL;
}
}
@@ -274,7 +270,7 @@ return_error:
cb = cb_sb_end;
/* Advance destination position to next sub-block. */
- *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK;
+ *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
if (!*dest_ofs && (++*dest_index > dest_max_index))
goto return_overflow;
goto do_next_sb;
@@ -301,7 +297,7 @@ return_error:
/* Advance destination position to next sub-block. */
*dest_ofs += NTFS_SB_SIZE;
- if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) {
+ if (!(*dest_ofs &= ~PAGE_MASK)) {
finalize_page:
/*
* First stage: add current page index to array of
@@ -335,7 +331,7 @@ do_next_tag:
*dest_ofs += nr_bytes;
}
/* We have finished the current sub-block. */
- if (!(*dest_ofs &= ~PAGE_CACHE_MASK))
+ if (!(*dest_ofs &= ~PAGE_MASK))
goto finalize_page;
goto do_next_sb;
}
@@ -462,7 +458,7 @@ return_overflow:
* have been written to so that we would lose data if we were to just overwrite
* them with the out-of-date uncompressed data.
*
- * FIXME: For PAGE_CACHE_SIZE > cb_size we are not doing the Right Thing(TM) at
+ * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at
* the end of the file I think. We need to detect this case and zero the out
* of bounds remainder of the page in question and mark it as handled. At the
* moment we would just return -EIO on such a page. This bug will only become
@@ -470,7 +466,7 @@ return_overflow:
* clusters so is probably not going to be seen by anyone. Still this should
* be fixed. (AIA)
*
- * FIXME: Again for PAGE_CACHE_SIZE > cb_size we are screwing up both in
+ * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in
* handling sparse and compressed cbs. (AIA)
*
* FIXME: At the moment we don't do any zeroing out in the case that
@@ -497,14 +493,14 @@ int ntfs_read_compressed_block(struct page *page)
u64 cb_size_mask = cb_size - 1UL;
VCN vcn;
LCN lcn;
- /* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */
- VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
+ /* The first wanted vcn (minimum alignment is PAGE_SIZE). */
+ VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
vol->cluster_size_bits;
/*
* The first vcn after the last wanted vcn (minimum alignment is again
- * PAGE_CACHE_SIZE.
+ * PAGE_SIZE.
*/
- VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
+ VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
& ~cb_size_mask) >> vol->cluster_size_bits;
/* Number of compression blocks (cbs) in the wanted vcn range. */
unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
@@ -515,7 +511,7 @@ int ntfs_read_compressed_block(struct page *page)
* guarantees of start_vcn and end_vcn, no need to round up here.
*/
unsigned int nr_pages = (end_vcn - start_vcn) <<
- vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+ vol->cluster_size_bits >> PAGE_SHIFT;
unsigned int xpage, max_page, cur_page, cur_ofs, i;
unsigned int cb_clusters, cb_max_ofs;
int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
@@ -549,7 +545,7 @@ int ntfs_read_compressed_block(struct page *page)
* We have already been given one page, this is the one we must do.
* Once again, the alignment guarantees keep it simple.
*/
- offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+ offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
xpage = index - offset;
pages[xpage] = page;
/*
@@ -560,13 +556,13 @@ int ntfs_read_compressed_block(struct page *page)
i_size = i_size_read(VFS_I(ni));
initialized_size = ni->initialized_size;
read_unlock_irqrestore(&ni->size_lock, flags);
- max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+ max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
offset;
/* Is the page fully outside i_size? (truncate in progress) */
if (xpage >= max_page) {
kfree(bhs);
kfree(pages);
- zero_user(page, 0, PAGE_CACHE_SIZE);
+ zero_user(page, 0, PAGE_SIZE);
ntfs_debug("Compressed read outside i_size - truncated?");
SetPageUptodate(page);
unlock_page(page);
@@ -591,7 +587,7 @@ int ntfs_read_compressed_block(struct page *page)
continue;
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
pages[i] = NULL;
}
}
@@ -735,9 +731,9 @@ lock_retry_remap:
ntfs_debug("Successfully read the compression block.");
/* The last page and maximum offset within it for the current cb. */
- cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size;
- cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK;
- cb_max_page >>= PAGE_CACHE_SHIFT;
+ cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
+ cb_max_ofs = cb_max_page & ~PAGE_MASK;
+ cb_max_page >>= PAGE_SHIFT;
/* Catch end of file inside a compression block. */
if (cb_max_page > max_page)
@@ -753,16 +749,11 @@ lock_retry_remap:
for (; cur_page < cb_max_page; cur_page++) {
page = pages[cur_page];
if (page) {
- /*
- * FIXME: Using clear_page() will become wrong
- * when we get PAGE_CACHE_SIZE != PAGE_SIZE but
- * for now there is no problem.
- */
if (likely(!cur_ofs))
clear_page(page_address(page));
else
memset(page_address(page) + cur_ofs, 0,
- PAGE_CACHE_SIZE -
+ PAGE_SIZE -
cur_ofs);
flush_dcache_page(page);
kunmap(page);
@@ -771,10 +762,10 @@ lock_retry_remap:
if (cur_page == xpage)
xpage_done = 1;
else
- page_cache_release(page);
+ put_page(page);
pages[cur_page] = NULL;
}
- cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+ cb_pos += PAGE_SIZE - cur_ofs;
cur_ofs = 0;
if (cb_pos >= cb_end)
break;
@@ -807,7 +798,7 @@ lock_retry_remap:
* synchronous io for the majority of pages.
* Or if we choose not to do the read-ahead/-behind stuff, we
* could just return block_read_full_page(pages[xpage]) as long
- * as PAGE_CACHE_SIZE <= cb_size.
+ * as PAGE_SIZE <= cb_size.
*/
if (cb_max_ofs)
cb_max_page--;
@@ -816,8 +807,8 @@ lock_retry_remap:
page = pages[cur_page];
if (page)
memcpy(page_address(page) + cur_ofs, cb_pos,
- PAGE_CACHE_SIZE - cur_ofs);
- cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+ PAGE_SIZE - cur_ofs);
+ cb_pos += PAGE_SIZE - cur_ofs;
cur_ofs = 0;
if (cb_pos >= cb_end)
break;
@@ -850,10 +841,10 @@ lock_retry_remap:
if (cur2_page == xpage)
xpage_done = 1;
else
- page_cache_release(page);
+ put_page(page);
pages[cur2_page] = NULL;
}
- cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2;
+ cb_pos2 += PAGE_SIZE - cur_ofs2;
cur_ofs2 = 0;
if (cb_pos2 >= cb_end)
break;
@@ -884,7 +875,7 @@ lock_retry_remap:
kunmap(page);
unlock_page(page);
if (prev_cur_page != xpage)
- page_cache_release(page);
+ put_page(page);
pages[prev_cur_page] = NULL;
}
}
@@ -914,7 +905,7 @@ lock_retry_remap:
kunmap(page);
unlock_page(page);
if (cur_page != xpage)
- page_cache_release(page);
+ put_page(page);
pages[cur_page] = NULL;
}
}
@@ -961,7 +952,7 @@ err_out:
kunmap(page);
unlock_page(page);
if (i != xpage)
- page_cache_release(page);
+ put_page(page);
}
}
kfree(pages);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9e38dafa3bc7..a18613579001 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -315,11 +315,11 @@ found_it:
descend_into_child_node:
/*
* Convert vcn to index into the index allocation attribute in units
- * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+ * of PAGE_SIZE and map the page cache page, reading it from
* disk if necessary.
*/
page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+ dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(sb, "Failed to map directory index page, error %ld.",
-PTR_ERR(page));
@@ -331,9 +331,9 @@ descend_into_child_node:
fast_descend_into_child_node:
/* Get to the index allocation block. */
ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+ dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
/* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
"inode 0x%lx or driver bug.", dir_ni->mft_no);
goto unm_err_out;
@@ -366,7 +366,7 @@ fast_descend_into_child_node:
goto unm_err_out;
}
index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_CACHE_SIZE) {
+ if (index_end > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
"0x%lx crosses page boundary. Impossible! "
"Cannot access! This is probably a bug in the "
@@ -559,9 +559,9 @@ found_it2:
/* If vcn is in the same page cache page as old_vcn we
* recycle the mapped page. */
if (old_vcn << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT == vcn <<
+ PAGE_SHIFT == vcn <<
vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT)
+ PAGE_SHIFT)
goto fast_descend_into_child_node;
unlock_page(page);
ntfs_unmap_page(page);
@@ -793,11 +793,11 @@ found_it:
descend_into_child_node:
/*
* Convert vcn to index into the index allocation attribute in units
- * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+ * of PAGE_SIZE and map the page cache page, reading it from
* disk if necessary.
*/
page = ntfs_map_page(ia_mapping, vcn <<
- dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+ dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(sb, "Failed to map directory index page, error %ld.",
-PTR_ERR(page));
@@ -809,9 +809,9 @@ descend_into_child_node:
fast_descend_into_child_node:
/* Get to the index allocation block. */
ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+ dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
/* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
"inode 0x%lx or driver bug.", dir_ni->mft_no);
goto unm_err_out;
@@ -844,7 +844,7 @@ fast_descend_into_child_node:
goto unm_err_out;
}
index_end = (u8*)ia + dir_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_CACHE_SIZE) {
+ if (index_end > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
"0x%lx crosses page boundary. Impossible! "
"Cannot access! This is probably a bug in the "
@@ -968,9 +968,9 @@ found_it2:
/* If vcn is in the same page cache page as old_vcn we
* recycle the mapped page. */
if (old_vcn << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT == vcn <<
+ PAGE_SHIFT == vcn <<
vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT)
+ PAGE_SHIFT)
goto fast_descend_into_child_node;
unlock_page(page);
ntfs_unmap_page(page);
@@ -1246,15 +1246,15 @@ skip_index_root:
goto iput_err_out;
}
/* Get the starting bit position in the current bitmap page. */
- cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1);
- bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1);
+ cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
+ bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
get_next_bmp_page:
ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
- (unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
+ (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
(unsigned long long)bmp_pos &
- (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
+ (unsigned long long)((PAGE_SIZE * 8) - 1));
bmp_page = ntfs_map_page(bmp_mapping,
- bmp_pos >> (3 + PAGE_CACHE_SHIFT));
+ bmp_pos >> (3 + PAGE_SHIFT));
if (IS_ERR(bmp_page)) {
ntfs_error(sb, "Reading index bitmap failed.");
err = PTR_ERR(bmp_page);
@@ -1270,9 +1270,9 @@ find_next_index_buffer:
* If we have reached the end of the bitmap page, get the next
* page, and put away the old one.
*/
- if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) {
+ if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
ntfs_unmap_page(bmp_page);
- bmp_pos += PAGE_CACHE_SIZE * 8;
+ bmp_pos += PAGE_SIZE * 8;
cur_bmp_pos = 0;
goto get_next_bmp_page;
}
@@ -1285,8 +1285,8 @@ find_next_index_buffer:
ntfs_debug("Handling index buffer 0x%llx.",
(unsigned long long)bmp_pos + cur_bmp_pos);
/* If the current index buffer is in the same page we reuse the page. */
- if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) !=
- (ia_pos & (s64)PAGE_CACHE_MASK)) {
+ if ((prev_ia_pos & (s64)PAGE_MASK) !=
+ (ia_pos & (s64)PAGE_MASK)) {
prev_ia_pos = ia_pos;
if (likely(ia_page != NULL)) {
unlock_page(ia_page);
@@ -1296,7 +1296,7 @@ find_next_index_buffer:
* Map the page cache page containing the current ia_pos,
* reading it from disk if necessary.
*/
- ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT);
+ ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
if (IS_ERR(ia_page)) {
ntfs_error(sb, "Reading index allocation data failed.");
err = PTR_ERR(ia_page);
@@ -1307,10 +1307,10 @@ find_next_index_buffer:
kaddr = (u8*)page_address(ia_page);
}
/* Get the current index buffer. */
- ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK &
- ~(s64)(ndir->itype.index.block_size - 1)));
+ ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
+ ~(s64)(ndir->itype.index.block_size - 1)));
/* Bounds checks. */
- if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) {
+ if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
"inode 0x%lx or driver bug.", vdir->i_ino);
goto err_out;
@@ -1348,7 +1348,7 @@ find_next_index_buffer:
goto err_out;
}
index_end = (u8*)ia + ndir->itype.index.block_size;
- if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) {
+ if (unlikely(index_end > kaddr + PAGE_SIZE)) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
"0x%lx crosses page boundary. Impossible! "
"Cannot access! This is probably a bug in the "
@@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
err = filemap_write_and_wait_range(vi->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&vi->i_mutex);
+ inode_lock(vi);
BUG_ON(!S_ISDIR(vi->i_mode));
/* If the bitmap attribute inode is in memory sync it, too. */
@@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
else
ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
"%u.", datasync ? "data" : "", vi->i_ino, -ret);
- mutex_unlock(&vi->i_mutex);
+ inode_unlock(vi);
return ret;
}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 262561fea923..91117ada8528 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -220,8 +220,8 @@ do_non_resident_extend:
m = NULL;
}
mapping = vi->i_mapping;
- index = old_init_size >> PAGE_CACHE_SHIFT;
- end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ index = old_init_size >> PAGE_SHIFT;
+ end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
do {
/*
* Read the page. If the page is not present, this will zero
@@ -233,7 +233,7 @@ do_non_resident_extend:
goto init_err_out;
}
if (unlikely(PageError(page))) {
- page_cache_release(page);
+ put_page(page);
err = -EIO;
goto init_err_out;
}
@@ -242,13 +242,13 @@ do_non_resident_extend:
* enough to make ntfs_writepage() work.
*/
write_lock_irqsave(&ni->size_lock, flags);
- ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
+ ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
if (ni->initialized_size > new_init_size)
ni->initialized_size = new_init_size;
write_unlock_irqrestore(&ni->size_lock, flags);
/* Set the page dirty so it gets written out. */
set_page_dirty(page);
- page_cache_release(page);
+ put_page(page);
/*
* Play nice with the vm and the rest of the system. This is
* very much needed as we can potentially be modifying the
@@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
}
}
err = add_to_page_cache_lru(*cached_page, mapping,
- index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
@@ -543,7 +543,7 @@ out:
err_out:
while (nr > 0) {
unlock_page(pages[--nr]);
- page_cache_release(pages[nr]);
+ put_page(pages[nr]);
}
goto out;
}
@@ -573,7 +573,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
* only partially being written to.
*
* If @nr_pages is greater than one, we are guaranteed that the cluster size is
- * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
+ * greater than PAGE_SIZE, that all pages in @pages are entirely inside
* the same cluster and that they are the entirety of that cluster, and that
* the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
*
@@ -653,7 +653,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
u = 0;
do_next_page:
page = pages[u];
- bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+ bh_pos = (s64)page->index << PAGE_SHIFT;
bh = head = page_buffers(page);
do {
VCN cdelta;
@@ -810,11 +810,11 @@ map_buffer_cached:
kaddr = kmap_atomic(page);
if (bh_pos < pos) {
- pofs = bh_pos & ~PAGE_CACHE_MASK;
+ pofs = bh_pos & ~PAGE_MASK;
memset(kaddr + pofs, 0, pos - bh_pos);
}
if (bh_end > end) {
- pofs = end & ~PAGE_CACHE_MASK;
+ pofs = end & ~PAGE_MASK;
memset(kaddr + pofs, 0, bh_end - end);
}
kunmap_atomic(kaddr);
@@ -942,7 +942,7 @@ rl_not_mapped_enoent:
* unmapped. This can only happen when the cluster size is
* less than the page cache size.
*/
- if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
+ if (unlikely(vol->cluster_size < PAGE_SIZE)) {
bh_cend = (bh_end + vol->cluster_size - 1) >>
vol->cluster_size_bits;
if ((bh_cend <= cpos || bh_cpos >= cend)) {
@@ -1208,7 +1208,7 @@ rl_not_mapped_enoent:
wait_on_buffer(bh);
if (likely(buffer_uptodate(bh))) {
page = bh->b_page;
- bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
+ bh_pos = ((s64)page->index << PAGE_SHIFT) +
bh_offset(bh);
/*
* If the buffer overflows the initialized size, need
@@ -1350,7 +1350,7 @@ rl_not_mapped_enoent:
bh = head = page_buffers(page);
do {
if (u == nr_pages &&
- ((s64)page->index << PAGE_CACHE_SHIFT) +
+ ((s64)page->index << PAGE_SHIFT) +
bh_offset(bh) >= end)
break;
if (!buffer_new(bh))
@@ -1422,7 +1422,7 @@ static inline int ntfs_commit_pages_after_non_resident_write(
bool partial;
page = pages[u];
- bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+ bh_pos = (s64)page->index << PAGE_SHIFT;
bh = head = page_buffers(page);
partial = false;
do {
@@ -1639,7 +1639,7 @@ static int ntfs_commit_pages_after_write(struct page **pages,
if (end < attr_len)
memcpy(kaddr + end, kattr + end, attr_len - end);
/* Zero the region outside the end of the attribute value. */
- memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+ memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
flush_dcache_page(page);
SetPageUptodate(page);
}
@@ -1706,7 +1706,7 @@ static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
unsigned len, copied;
do {
- len = PAGE_CACHE_SIZE - ofs;
+ len = PAGE_SIZE - ofs;
if (len > bytes)
len = bytes;
copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
@@ -1724,14 +1724,14 @@ out:
return total;
err:
/* Zero the rest of the target like __copy_from_user(). */
- len = PAGE_CACHE_SIZE - copied;
+ len = PAGE_SIZE - copied;
do {
if (len > bytes)
len = bytes;
zero_user(*pages, copied, len);
bytes -= len;
copied = 0;
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
} while (++pages < last_page);
goto out;
}
@@ -1787,8 +1787,8 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
* attributes.
*/
nr_pages = 1;
- if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
- nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
+ if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
+ nr_pages = vol->cluster_size >> PAGE_SHIFT;
last_vcn = -1;
do {
VCN vcn;
@@ -1796,9 +1796,9 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
unsigned ofs, do_pages, u;
size_t copied;
- start_idx = idx = pos >> PAGE_CACHE_SHIFT;
- ofs = pos & ~PAGE_CACHE_MASK;
- bytes = PAGE_CACHE_SIZE - ofs;
+ start_idx = idx = pos >> PAGE_SHIFT;
+ ofs = pos & ~PAGE_MASK;
+ bytes = PAGE_SIZE - ofs;
do_pages = 1;
if (nr_pages > 1) {
vcn = pos >> vol->cluster_size_bits;
@@ -1832,7 +1832,7 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
if (lcn == LCN_HOLE) {
start_idx = (pos & ~(s64)
vol->cluster_size_mask)
- >> PAGE_CACHE_SHIFT;
+ >> PAGE_SHIFT;
bytes = vol->cluster_size - (pos &
vol->cluster_size_mask);
do_pages = nr_pages;
@@ -1871,12 +1871,12 @@ again:
if (unlikely(status)) {
do {
unlock_page(pages[--do_pages]);
- page_cache_release(pages[do_pages]);
+ put_page(pages[do_pages]);
} while (do_pages);
break;
}
}
- u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
+ u = (pos >> PAGE_SHIFT) - pages[0]->index;
copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
i, bytes);
ntfs_flush_dcache_pages(pages + u, do_pages - u);
@@ -1889,7 +1889,7 @@ again:
}
do {
unlock_page(pages[--do_pages]);
- page_cache_release(pages[do_pages]);
+ put_page(pages[do_pages]);
} while (do_pages);
if (unlikely(status < 0))
break;
@@ -1921,7 +1921,7 @@ again:
}
} while (iov_iter_count(i));
if (cached_page)
- page_cache_release(cached_page);
+ put_page(cached_page);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
written ? "written" : "status", (unsigned long)written,
(long)status);
@@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t written = 0;
ssize_t err;
- mutex_lock(&vi->i_mutex);
+ inode_lock(vi);
/* We can write back this queue in page reclaim. */
current->backing_dev_info = inode_to_bdi(vi);
err = ntfs_prepare_file_for_write(iocb, from);
if (iov_iter_count(from) && !err)
written = ntfs_perform_write(file, from, iocb->ki_pos);
current->backing_dev_info = NULL;
- mutex_unlock(&vi->i_mutex);
+ inode_unlock(vi);
if (likely(written > 0)) {
err = generic_write_sync(file, iocb->ki_pos, written);
if (err < 0)
@@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
err = filemap_write_and_wait_range(vi->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&vi->i_mutex);
+ inode_lock(vi);
BUG_ON(S_ISDIR(vi->i_mode));
if (!datasync || !NInoNonResident(NTFS_I(vi)))
@@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
else
ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
"%u.", datasync ? "data" : "", vi->i_ino, -ret);
- mutex_unlock(&vi->i_mutex);
+ inode_unlock(vi);
return ret;
}
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 096c135691ae..0d645f357930 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -272,11 +272,11 @@ done:
descend_into_child_node:
/*
* Convert vcn to index into the index allocation attribute in units
- * of PAGE_CACHE_SIZE and map the page cache page, reading it from
+ * of PAGE_SIZE and map the page cache page, reading it from
* disk if necessary.
*/
page = ntfs_map_page(ia_mapping, vcn <<
- idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+ idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(sb, "Failed to map index page, error %ld.",
-PTR_ERR(page));
@@ -288,9 +288,9 @@ descend_into_child_node:
fast_descend_into_child_node:
/* Get to the index allocation block. */
ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
- idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+ idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
/* Bounds checks. */
- if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Out of bounds check failed. Corrupt inode "
"0x%lx or driver bug.", idx_ni->mft_no);
goto unm_err_out;
@@ -323,7 +323,7 @@ fast_descend_into_child_node:
goto unm_err_out;
}
index_end = (u8*)ia + idx_ni->itype.index.block_size;
- if (index_end > kaddr + PAGE_CACHE_SIZE) {
+ if (index_end > kaddr + PAGE_SIZE) {
ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
"crosses page boundary. Impossible! Cannot "
"access! This is probably a bug in the "
@@ -427,9 +427,9 @@ ia_done:
* the mapped page.
*/
if (old_vcn << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT == vcn <<
+ PAGE_SHIFT == vcn <<
vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT)
+ PAGE_SHIFT)
goto fast_descend_into_child_node;
unlock_page(page);
ntfs_unmap_page(page);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d284f07eda77..f40972d6df90 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -868,12 +868,12 @@ skip_attr_list_load:
ni->itype.index.block_size);
goto unm_err_out;
}
- if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+ if (ni->itype.index.block_size > PAGE_SIZE) {
ntfs_error(vi->i_sb, "Index block size (%u) > "
- "PAGE_CACHE_SIZE (%ld) is not "
+ "PAGE_SIZE (%ld) is not "
"supported. Sorry.",
ni->itype.index.block_size,
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
err = -EOPNOTSUPP;
goto unm_err_out;
}
@@ -1585,10 +1585,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
"two.", ni->itype.index.block_size);
goto unm_err_out;
}
- if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
- ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE "
+ if (ni->itype.index.block_size > PAGE_SIZE) {
+ ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE "
"(%ld) is not supported. Sorry.",
- ni->itype.index.block_size, PAGE_CACHE_SIZE);
+ ni->itype.index.block_size, PAGE_SIZE);
err = -EOPNOTSUPP;
goto unm_err_out;
}
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 1711b710b641..27a24a42f712 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -283,15 +283,15 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
ntfs_unmap_page(page);
}
page = ntfs_map_page(mapping, last_read_pos >>
- PAGE_CACHE_SHIFT);
+ PAGE_SHIFT);
if (IS_ERR(page)) {
err = PTR_ERR(page);
ntfs_error(vol->sb, "Failed to map page.");
goto out;
}
- buf_size = last_read_pos & ~PAGE_CACHE_MASK;
+ buf_size = last_read_pos & ~PAGE_MASK;
buf = page_address(page) + buf_size;
- buf_size = PAGE_CACHE_SIZE - buf_size;
+ buf_size = PAGE_SIZE - buf_size;
if (unlikely(last_read_pos + buf_size > i_size))
buf_size = i_size - last_read_pos;
buf_size <<= 3;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index c71de292c5ad..9d71213ca81e 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -381,7 +381,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
* completely inside @rp, just copy it from there. Otherwise map all
* the required pages and copy the data from them.
*/
- size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK);
+ size = PAGE_SIZE - (pos & ~PAGE_MASK);
if (size >= le32_to_cpu(rp->system_page_size)) {
memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
} else {
@@ -394,8 +394,8 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
/* Copy the remaining data one page at a time. */
have_read = size;
to_read = le32_to_cpu(rp->system_page_size) - size;
- idx = (pos + size) >> PAGE_CACHE_SHIFT;
- BUG_ON((pos + size) & ~PAGE_CACHE_MASK);
+ idx = (pos + size) >> PAGE_SHIFT;
+ BUG_ON((pos + size) & ~PAGE_MASK);
do {
page = ntfs_map_page(vi->i_mapping, idx);
if (IS_ERR(page)) {
@@ -406,7 +406,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
err = -EIO;
goto err_out;
}
- size = min_t(int, to_read, PAGE_CACHE_SIZE);
+ size = min_t(int, to_read, PAGE_SIZE);
memcpy((u8*)trp + have_read, page_address(page), size);
ntfs_unmap_page(page);
have_read += size;
@@ -509,11 +509,11 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
* log page size if the page cache size is between the default log page
* size and twice that.
*/
- if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <=
+ if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
DefaultLogPageSize * 2)
log_page_size = DefaultLogPageSize;
else
- log_page_size = PAGE_CACHE_SIZE;
+ log_page_size = PAGE_SIZE;
log_page_mask = log_page_size - 1;
/*
* Use ntfs_ffs() instead of ffs() to enable the compiler to
@@ -539,7 +539,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
* to be empty.
*/
for (pos = 0; pos < size; pos <<= 1) {
- pgoff_t idx = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t idx = pos >> PAGE_SHIFT;
if (!page || page->index != idx) {
if (page)
ntfs_unmap_page(page);
@@ -550,7 +550,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
goto err_out;
}
}
- kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
+ kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
/*
* A non-empty block means the logfile is not empty while an
* empty block after a non-empty block has been encountered
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3014a36a255b..37b2501caaa4 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -61,16 +61,16 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
* here if the volume was that big...
*/
index = (u64)ni->mft_no << vol->mft_record_size_bits >>
- PAGE_CACHE_SHIFT;
- ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ PAGE_SHIFT;
+ ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
i_size = i_size_read(mft_vi);
/* The maximum valid index into the page cache for $MFT's data. */
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
/* If the wanted index is out of bounds the mft record doesn't exist. */
if (unlikely(index >= end_index)) {
- if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
+ if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
vol->mft_record_size) {
page = ERR_PTR(-ENOENT);
ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
@@ -487,7 +487,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
}
/* Get the page containing the mirror copy of the mft record @m. */
page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
- (PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
+ (PAGE_SHIFT - vol->mft_record_size_bits));
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to map mft mirror page.");
err = PTR_ERR(page);
@@ -497,7 +497,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
BUG_ON(!PageUptodate(page));
ClearPageUptodate(page);
/* Offset of the mft mirror record inside the page. */
- page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
/* The address in the page of the mirror copy of the mft record @m. */
kmirr = page_address(page) + page_ofs;
/* Copy the mst protected mft record to the mirror. */
@@ -1178,8 +1178,8 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
for (; pass <= 2;) {
/* Cap size to pass_end. */
ofs = data_pos >> 3;
- page_ofs = ofs & ~PAGE_CACHE_MASK;
- size = PAGE_CACHE_SIZE - page_ofs;
+ page_ofs = ofs & ~PAGE_MASK;
+ size = PAGE_SIZE - page_ofs;
ll = ((pass_end + 7) >> 3) - ofs;
if (size > ll)
size = ll;
@@ -1190,7 +1190,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
*/
if (size) {
page = ntfs_map_page(mftbmp_mapping,
- ofs >> PAGE_CACHE_SHIFT);
+ ofs >> PAGE_SHIFT);
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to read mft "
"bitmap, aborting.");
@@ -1328,13 +1328,13 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
*/
ll = lcn >> 3;
page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
- ll >> PAGE_CACHE_SHIFT);
+ ll >> PAGE_SHIFT);
if (IS_ERR(page)) {
up_write(&mftbmp_ni->runlist.lock);
ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
return PTR_ERR(page);
}
- b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
+ b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
tb = 1 << (lcn & 7ull);
down_write(&vol->lcnbmp_lock);
if (*b != 0xff && !(*b & tb)) {
@@ -2103,14 +2103,14 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
* The index into the page cache and the offset within the page cache
* page of the wanted mft record.
*/
- index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
- ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
+ ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
/* The maximum valid index into the page cache for $MFT's data. */
i_size = i_size_read(mft_vi);
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (unlikely(index >= end_index)) {
if (unlikely(index > end_index || ofs + vol->mft_record_size >=
- (i_size & ~PAGE_CACHE_MASK))) {
+ (i_size & ~PAGE_MASK))) {
ntfs_error(vol->sb, "Tried to format non-existing mft "
"record 0x%llx.", (long long)mft_no);
return -ENOENT;
@@ -2515,8 +2515,8 @@ mft_rec_already_initialized:
* We now have allocated and initialized the mft record. Calculate the
* index of and the offset within the page cache page the record is in.
*/
- index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
- ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
+ ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
/* Read, map, and pin the page containing the mft record. */
page = ntfs_map_page(vol->mft_ino->i_mapping, index);
if (IS_ERR(page)) {
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index c581e26a350d..12de47b96ca9 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -43,7 +43,7 @@ typedef enum {
NTFS_MAX_NAME_LEN = 255,
NTFS_MAX_ATTR_NAME_LEN = 255,
NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */
- NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE,
+ NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
} NTFS_CONSTANTS;
/* Global variables. */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index d80e3315cab0..9793e68ba1dd 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
ntfs_error(vol->sb, "Quota inodes are not open.");
return false;
}
- mutex_lock(&vol->quota_q_ino->i_mutex);
+ inode_lock(vol->quota_q_ino);
ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
if (!ictx) {
ntfs_error(vol->sb, "Failed to get index context.");
@@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
ntfs_index_entry_mark_dirty(ictx);
set_done:
ntfs_index_ctx_put(ictx);
- mutex_unlock(&vol->quota_q_ino->i_mutex);
+ inode_unlock(vol->quota_q_ino);
/*
* We set the flag so we do not try to mark the quotas out of date
* again on remount.
@@ -110,7 +110,7 @@ done:
err_out:
if (ictx)
ntfs_index_ctx_put(ictx);
- mutex_unlock(&vol->quota_q_ino->i_mutex);
+ inode_unlock(vol->quota_q_ino);
return false;
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d1a853585b53..ecb49870a680 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -823,14 +823,14 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
vol->mft_record_size_bits, vol->mft_record_size_bits);
/*
- * We cannot support mft record sizes above the PAGE_CACHE_SIZE since
+ * We cannot support mft record sizes above the PAGE_SIZE since
* we store $MFT/$DATA, the table of mft records in the page cache.
*/
- if (vol->mft_record_size > PAGE_CACHE_SIZE) {
+ if (vol->mft_record_size > PAGE_SIZE) {
ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
- "PAGE_CACHE_SIZE on your system (%lu). "
+ "PAGE_SIZE on your system (%lu). "
"This is not supported. Sorry.",
- vol->mft_record_size, PAGE_CACHE_SIZE);
+ vol->mft_record_size, PAGE_SIZE);
return false;
}
/* We cannot support mft record sizes below the sector size. */
@@ -1096,7 +1096,7 @@ static bool check_mft_mirror(ntfs_volume *vol)
ntfs_debug("Entering.");
/* Compare contents of $MFT and $MFTMirr. */
- mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size;
+ mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
BUG_ON(!mrecs_per_page);
BUG_ON(!vol->mftmirr_size);
mft_page = mirr_page = NULL;
@@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
* Find the inode number for the hibernation file by looking up the
* filename hiberfil.sys in the root directory.
*/
- mutex_lock(&vol->root_ino->i_mutex);
+ inode_lock(vol->root_ino);
mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
&name);
- mutex_unlock(&vol->root_ino->i_mutex);
+ inode_unlock(vol->root_ino);
if (IS_ERR_MREF(mref)) {
ret = MREF_ERR(mref);
/* If the file does not exist, Windows is not hibernated. */
@@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol)
* Find the inode number for the quota file by looking up the filename
* $Quota in the extended system files directory $Extend.
*/
- mutex_lock(&vol->extend_ino->i_mutex);
+ inode_lock(vol->extend_ino);
mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
&name);
- mutex_unlock(&vol->extend_ino->i_mutex);
+ inode_unlock(vol->extend_ino);
if (IS_ERR_MREF(mref)) {
/*
* If the file does not exist, quotas are disabled and have
@@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
* Find the inode number for the transaction log file by looking up the
* filename $UsnJrnl in the extended system files directory $Extend.
*/
- mutex_lock(&vol->extend_ino->i_mutex);
+ inode_lock(vol->extend_ino);
mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
&name);
- mutex_unlock(&vol->extend_ino->i_mutex);
+ inode_unlock(vol->extend_ino);
if (IS_ERR_MREF(mref)) {
/*
* If the file does not exist, transaction logging is disabled,
@@ -1615,20 +1615,20 @@ static bool load_and_init_attrdef(ntfs_volume *vol)
if (!vol->attrdef)
goto iput_failed;
index = 0;
- max_index = i_size >> PAGE_CACHE_SHIFT;
- size = PAGE_CACHE_SIZE;
+ max_index = i_size >> PAGE_SHIFT;
+ size = PAGE_SIZE;
while (index < max_index) {
/* Read the attrdef table and copy it into the linear buffer. */
read_partial_attrdef_page:
page = ntfs_map_page(ino->i_mapping, index);
if (IS_ERR(page))
goto free_iput_failed;
- memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT),
+ memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
page_address(page), size);
ntfs_unmap_page(page);
};
- if (size == PAGE_CACHE_SIZE) {
- size = i_size & ~PAGE_CACHE_MASK;
+ if (size == PAGE_SIZE) {
+ size = i_size & ~PAGE_MASK;
if (size)
goto read_partial_attrdef_page;
}
@@ -1684,20 +1684,20 @@ static bool load_and_init_upcase(ntfs_volume *vol)
if (!vol->upcase)
goto iput_upcase_failed;
index = 0;
- max_index = i_size >> PAGE_CACHE_SHIFT;
- size = PAGE_CACHE_SIZE;
+ max_index = i_size >> PAGE_SHIFT;
+ size = PAGE_SIZE;
while (index < max_index) {
/* Read the upcase table and copy it into the linear buffer. */
read_partial_upcase_page:
page = ntfs_map_page(ino->i_mapping, index);
if (IS_ERR(page))
goto iput_upcase_failed;
- memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT),
+ memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
page_address(page), size);
ntfs_unmap_page(page);
};
- if (size == PAGE_CACHE_SIZE) {
- size = i_size & ~PAGE_CACHE_MASK;
+ if (size == PAGE_SIZE) {
+ size = i_size & ~PAGE_MASK;
if (size)
goto read_partial_upcase_page;
}
@@ -2471,14 +2471,14 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
down_read(&vol->lcnbmp_lock);
/*
* Convert the number of bits into bytes rounded up, then convert into
- * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one
+ * multiples of PAGE_SIZE, rounding up so that if we have one
* full and one partial page max_index = 2.
*/
- max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
+ max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
- max_index, PAGE_CACHE_SIZE / 4);
+ max_index, PAGE_SIZE / 4);
for (index = 0; index < max_index; index++) {
unsigned long *kaddr;
@@ -2491,7 +2491,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
if (IS_ERR(page)) {
ntfs_debug("read_mapping_page() error. Skipping "
"page (index 0x%lx).", index);
- nr_free -= PAGE_CACHE_SIZE * 8;
+ nr_free -= PAGE_SIZE * 8;
continue;
}
kaddr = kmap_atomic(page);
@@ -2503,9 +2503,9 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
* ntfs_readpage().
*/
nr_free -= bitmap_weight(kaddr,
- PAGE_CACHE_SIZE * BITS_PER_BYTE);
+ PAGE_SIZE * BITS_PER_BYTE);
kunmap_atomic(kaddr);
- page_cache_release(page);
+ put_page(page);
}
ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
/*
@@ -2547,9 +2547,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
pgoff_t index;
ntfs_debug("Entering.");
- /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
+ /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
- "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
+ "0x%lx.", max_index, PAGE_SIZE / 4);
for (index = 0; index < max_index; index++) {
unsigned long *kaddr;
@@ -2562,7 +2562,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
if (IS_ERR(page)) {
ntfs_debug("read_mapping_page() error. Skipping "
"page (index 0x%lx).", index);
- nr_free -= PAGE_CACHE_SIZE * 8;
+ nr_free -= PAGE_SIZE * 8;
continue;
}
kaddr = kmap_atomic(page);
@@ -2574,9 +2574,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
* ntfs_readpage().
*/
nr_free -= bitmap_weight(kaddr,
- PAGE_CACHE_SIZE * BITS_PER_BYTE);
+ PAGE_SIZE * BITS_PER_BYTE);
kunmap_atomic(kaddr);
- page_cache_release(page);
+ put_page(page);
}
ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
index - 1);
@@ -2618,17 +2618,17 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
/* Type of filesystem. */
sfs->f_type = NTFS_SB_MAGIC;
/* Optimal transfer block size. */
- sfs->f_bsize = PAGE_CACHE_SIZE;
+ sfs->f_bsize = PAGE_SIZE;
/*
* Total data blocks in filesystem in units of f_bsize and since
* inodes are also stored in data blocs ($MFT is a file) this is just
* the total clusters.
*/
sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
/* Free data blocks in filesystem in units of f_bsize. */
size = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
- PAGE_CACHE_SHIFT;
+ PAGE_SHIFT;
if (size < 0LL)
size = 0LL;
/* Free blocks avail to non-superuser, same as above on NTFS. */
@@ -2639,11 +2639,11 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
/*
* Convert the maximum number of set bits into bytes rounded up, then
- * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we
+ * convert into multiples of PAGE_SIZE, rounding up so that if we
* have one full and one partial page max_index = 2.
*/
max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
- + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
read_unlock_irqrestore(&mft_ni->size_lock, flags);
/* Number of inodes in filesystem (at this point in time). */
sfs->f_files = size;
@@ -2765,15 +2765,15 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
if (!parse_options(vol, (char*)opt))
goto err_out_now;
- /* We support sector sizes up to the PAGE_CACHE_SIZE. */
- if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+ /* We support sector sizes up to the PAGE_SIZE. */
+ if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
if (!silent)
ntfs_error(sb, "Device has unsupported sector size "
"(%i). The maximum supported sector "
"size on this architecture is %lu "
"bytes.",
bdev_logical_block_size(sb->s_bdev),
- PAGE_CACHE_SIZE);
+ PAGE_SIZE);
goto err_out_now;
}
/*
@@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void)
ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
sizeof(big_ntfs_inode), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
- ntfs_big_inode_init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ntfs_big_inode_init_once);
if (!ntfs_big_inode_cache) {
pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
goto big_inode_err_out;
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index ce210d4951a1..e27e6527912b 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -41,7 +41,8 @@ ocfs2-objs := \
quota_local.o \
quota_global.o \
xattr.o \
- acl.o
+ acl.o \
+ filecheck.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 86181d6526dc..e361d1a0ca09 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
.eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
.eo_update_clusters = ocfs2_dinode_update_clusters,
@@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_value_update_clusters,
@@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}
-static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_xattr_tree_update_clusters,
@@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
et->et_root_el = &dx_root->dr_list;
}
-static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
.eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
.eo_update_clusters = ocfs2_dx_root_update_clusters,
@@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
return CONTIG_NONE;
}
-static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
.eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
.eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
.eo_update_clusters = ocfs2_refcount_tree_update_clusters,
@@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
struct buffer_head *bh,
ocfs2_journal_access_func access,
void *obj,
- struct ocfs2_extent_tree_operations *ops)
+ const struct ocfs2_extent_tree_operations *ops)
{
et->et_ops = ops;
et->et_root_bh = bh;
@@ -2516,21 +2516,6 @@ static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_block *eb;
u32 range;
- /*
- * In normal tree rotation process, we will never touch the
- * tree branch above subtree_index and ocfs2_extend_rotate_transaction
- * doesn't reserve the credits for them either.
- *
- * But we do have a special case here which will update the rightmost
- * records for all the bh in the path.
- * So we have to allocate extra credits and access them.
- */
- ret = ocfs2_extend_trans(handle, subtree_index);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
mlog_errno(ret);
@@ -2956,7 +2941,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
right_path->p_node[subtree_root].bh->b_blocknr,
right_path->p_tree_depth);
- ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
orig_credits, left_path);
if (ret) {
mlog_errno(ret);
@@ -3029,21 +3014,9 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
-
ret = ocfs2_et_sanity_check(et);
if (ret)
goto out;
- /*
- * There's two ways we handle this depending on
- * whether path is the only existing one.
- */
- ret = ocfs2_extend_rotate_transaction(handle, 0,
- handle->h_buffer_credits,
- path);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
ret = ocfs2_journal_access_path(et->et_ci, handle, path);
if (ret) {
@@ -3641,6 +3614,14 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
*/
if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
le16_to_cpu(el->l_next_free_rec) == 1) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ right_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_remove_rightmost_path(handle, et,
right_path,
@@ -3679,6 +3660,14 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
* The merge code will need to create an empty
* extent to take the place of the newly
@@ -3727,6 +3716,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
*/
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
/* The merge left us with an empty extent, remove it. */
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
@@ -3748,6 +3746,15 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
goto out;
}
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
/*
* Error from this last rotate is not critical, so
@@ -3783,6 +3790,16 @@ static int ocfs2_try_to_merge_extent(handle_t *handle,
}
if (ctxt->c_split_covers_rec) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ ret = 0;
+ goto out;
+ }
+
/*
* The merge may have left an empty extent in
* our leaf. Try to rotate it away.
@@ -5342,6 +5359,15 @@ static int ocfs2_truncate_rec(handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ /* extend credit for ocfs2_remove_rightmost_path */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
if (ret) {
mlog_errno(ret);
@@ -5719,7 +5745,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
goto bail;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -5776,7 +5802,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
out_commit:
ocfs2_commit_trans(osb, handle);
out:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
@@ -5832,7 +5858,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+ BUG_ON(inode_trylock(tl_inode));
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5928,16 +5954,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, tl_bh);
- /* TODO: Perhaps we can calculate the bulk of the
- * credits up front rather than extending like
- * this. */
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
rec = tl->tl_recs[i];
start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
le32_to_cpu(rec.t_start));
@@ -5958,6 +5974,13 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
goto bail;
}
}
+
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
i--;
}
@@ -5980,7 +6003,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
struct ocfs2_dinode *di;
struct ocfs2_truncate_log *tl;
- BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+ BUG_ON(inode_trylock(tl_inode));
di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -6008,7 +6031,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out;
}
- mutex_lock(&data_alloc_inode->i_mutex);
+ inode_lock(data_alloc_inode);
status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
if (status < 0) {
@@ -6016,7 +6039,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -6035,7 +6058,7 @@ out_unlock:
ocfs2_inode_unlock(data_alloc_inode, 1);
out_mutex:
- mutex_unlock(&data_alloc_inode->i_mutex);
+ inode_unlock(data_alloc_inode);
iput(data_alloc_inode);
out:
@@ -6047,9 +6070,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
status = __ocfs2_flush_truncate_log(osb);
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
return status;
}
@@ -6079,7 +6102,7 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
if (cancel)
cancel_delayed_work(&osb->osb_truncate_log_wq);
- queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
}
}
@@ -6174,8 +6197,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
bail:
- if (tl_inode)
- iput(tl_inode);
+ iput(tl_inode);
brelse(tl_bh);
if (status < 0) {
@@ -6209,7 +6231,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
num_recs);
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
for(i = 0; i < num_recs; i++) {
if (ocfs2_truncate_log_needs_flush(osb)) {
status = __ocfs2_flush_truncate_log(osb);
@@ -6240,7 +6262,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
}
bail_up:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
return status;
}
@@ -6254,7 +6276,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
status = ocfs2_flush_truncate_log(osb);
if (status < 0)
@@ -6347,7 +6369,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
goto out;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
@@ -6396,7 +6418,7 @@ out_unlock:
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
out:
while(head) {
@@ -6440,7 +6462,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
handle_t *handle;
int ret = 0;
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
while (head) {
if (ocfs2_truncate_log_needs_flush(osb)) {
@@ -6472,7 +6494,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
}
}
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
while (head) {
/* Premature exit may have left some dangling items. */
@@ -6649,7 +6671,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
{
int i;
struct page *page;
- unsigned int from, to = PAGE_CACHE_SIZE;
+ unsigned int from, to = PAGE_SIZE;
struct super_block *sb = inode->i_sb;
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
@@ -6657,21 +6679,21 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
if (numpages == 0)
goto out;
- to = PAGE_CACHE_SIZE;
+ to = PAGE_SIZE;
for(i = 0; i < numpages; i++) {
page = pages[i];
- from = start & (PAGE_CACHE_SIZE - 1);
- if ((end >> PAGE_CACHE_SHIFT) == page->index)
- to = end & (PAGE_CACHE_SIZE - 1);
+ from = start & (PAGE_SIZE - 1);
+ if ((end >> PAGE_SHIFT) == page->index)
+ to = end & (PAGE_SIZE - 1);
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
+ BUG_ON(from > PAGE_SIZE);
+ BUG_ON(to > PAGE_SIZE);
ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
&phys);
- start = (page->index + 1) << PAGE_CACHE_SHIFT;
+ start = (page->index + 1) << PAGE_SHIFT;
}
out:
if (pages)
@@ -6690,7 +6712,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
numpages = 0;
last_page_bytes = PAGE_ALIGN(end);
- index = start >> PAGE_CACHE_SHIFT;
+ index = start >> PAGE_SHIFT;
do {
pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
if (!pages[numpages]) {
@@ -6701,7 +6723,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
numpages++;
index++;
- } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
+ } while (index < (last_page_bytes >> PAGE_SHIFT));
out:
if (ret != 0) {
@@ -6928,8 +6950,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* to do that now.
*/
if (!ocfs2_sparse_alloc(osb) &&
- PAGE_CACHE_SIZE < osb->s_clustersize)
- end = PAGE_CACHE_SIZE;
+ PAGE_SIZE < osb->s_clustersize)
+ end = PAGE_SIZE;
ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
if (ret) {
@@ -6949,8 +6971,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
goto out_unlock;
}
- page_end = PAGE_CACHE_SIZE;
- if (PAGE_CACHE_SIZE > osb->s_clustersize)
+ page_end = PAGE_SIZE;
+ if (PAGE_SIZE > osb->s_clustersize)
page_end = osb->s_clustersize;
for (i = 0; i < num_pages; i++)
@@ -7356,7 +7378,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
if (ret < 0) {
@@ -7423,7 +7445,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 0);
brelse(main_bm_bh);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
return ret;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fb09b97db162..f3dc1b0dfffc 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -54,7 +54,7 @@
*/
struct ocfs2_extent_tree_operations;
struct ocfs2_extent_tree {
- struct ocfs2_extent_tree_operations *et_ops;
+ const struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
struct ocfs2_caching_info *et_ci;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7f604727f487..ad1577348a92 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -234,7 +234,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
size = i_size_read(inode);
- if (size > PAGE_CACHE_SIZE ||
+ if (size > PAGE_SIZE ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
"Inode %llu has with inline data has bad size: %Lu\n",
@@ -247,7 +247,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size)
memcpy(kaddr, di->id2.i_data.id_data, size);
/* Clear the remaining part of the page */
- memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
+ memset(kaddr + size, 0, PAGE_SIZE - size);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -282,7 +282,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+ loff_t start = (loff_t)page->index << PAGE_SHIFT;
int ret, unlock = 1;
trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
@@ -385,7 +385,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
* drop out in that case as it's not worth handling here.
*/
last = list_entry(pages->prev, struct page, lru);
- start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+ start = (loff_t)last->index << PAGE_SHIFT;
if (start >= i_size_read(inode))
goto out_unlock;
@@ -499,153 +499,6 @@ bail:
return status;
}
-/*
- * TODO: Make this into a generic get_blocks function.
- *
- * From do_direct_io in direct-io.c:
- * "So what we do is to permit the ->get_blocks function to populate
- * bh.b_size with the size of IO which is permitted at this offset and
- * this i_blkbits."
- *
- * This function is called directly from get_more_blocks in direct-io.c.
- *
- * called like this: dio->get_blocks(dio->inode, fs_startblk,
- * fs_count, map_bh, dio->rw == WRITE);
- */
-static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- u32 cpos = 0;
- int alloc_locked = 0;
- u64 p_blkno, inode_blocks, contig_blocks;
- unsigned int ext_flags;
- unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
- unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
- unsigned long len = bh_result->b_size;
- unsigned int clusters_to_alloc = 0, contig_clusters = 0;
-
- cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
-
- /* This function won't even be called if the request isn't all
- * nicely aligned and of the right size, so there's no need
- * for us to check any of that. */
-
- inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* This figures out the size of the next contiguous block, and
- * our logical offset */
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
- if (ret) {
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
-
- /* We should already CoW the refcounted extent in case of create. */
- BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
-
- /* allocate blocks if no p_blkno is found, and create == 1 */
- if (!p_blkno && create) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto bail;
- }
-
- alloc_locked = 1;
-
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- /* fill hole, allocate blocks can't be larger than the size
- * of the hole */
- clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
- contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb,
- contig_blocks);
- if (clusters_to_alloc > contig_clusters)
- clusters_to_alloc = contig_clusters;
-
- /* allocate extent and insert them into the extent tree */
- ret = ocfs2_extend_allocation(inode, cpos,
- clusters_to_alloc, 0);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog_errno(ret);
- goto bail;
- }
-
- ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
- &contig_blocks, &ext_flags);
- if (ret < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
- (unsigned long long)iblock);
- ret = -EIO;
- goto bail;
- }
- set_buffer_new(bh_result);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
-
- /*
- * get_more_blocks() expects us to describe a hole by clearing
- * the mapped bit on bh_result().
- *
- * Consider an unwritten extent as a hole.
- */
- if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- map_bh(bh_result, inode->i_sb, p_blkno);
- else
- clear_buffer_mapped(bh_result);
-
- /* make sure we don't map more than max_blocks blocks here as
- that's all the kernel will handle at this point. */
- if (max_blocks < contig_blocks)
- contig_blocks = max_blocks;
- bh_result->b_size = contig_blocks << blocksize_bits;
-bail:
- if (alloc_locked)
- ocfs2_inode_unlock(inode, 1);
- return ret;
-}
-
-/*
- * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. We use the rw_lock DLM lock
- * to protect io on one node from truncation on another.
- */
-static void ocfs2_dio_end_io(struct kiocb *iocb,
- loff_t offset,
- ssize_t bytes,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- int level;
-
- /* this io's submitter should not have unlocked this before we could */
- BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
-
- if (ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
-
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
- /* Let rw unlock to be done later to protect append direct io write */
- if (offset + bytes <= i_size_read(inode)) {
- ocfs2_iocb_clear_rw_locked(iocb);
-
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
- }
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
if (!page_has_buffers(page))
@@ -653,373 +506,17 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
-static int ocfs2_is_overwrite(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- int ret = 0;
- u32 v_cpos = 0;
- u32 p_cpos = 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
- return 1;
-
- return 0;
-}
-
-static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset,
- u64 zero_len, int cluster_align)
-{
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- int ret = 0;
-
- if (offset <= i_size_read(inode) || cluster_align)
- return 0;
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- u64 s = i_size_read(inode);
- sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) +
- (do_div(s, osb->s_clustersize) >> 9);
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector,
- zero_len >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
- }
-
- return ret;
-}
-
-static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb,
- struct inode *inode, loff_t offset)
-{
- u64 zero_start, zero_len, total_zero_len;
- u32 p_cpos = 0, clusters_to_add;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode));
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
- u32 size_div, offset_div;
- int ret = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- offset_div = do_div(o, osb->s_clustersize);
- size_div = do_div(s, osb->s_clustersize);
- }
-
- if (offset <= i_size_read(inode))
- return 0;
-
- clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) -
- ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode));
- total_zero_len = offset - i_size_read(inode);
- if (clusters_to_add)
- total_zero_len -= offset_div;
-
- /* Allocate clusters to fill out holes, and this is only needed
- * when we add more than one clusters. Otherwise the cluster will
- * be allocated during direct IO */
- if (clusters_to_add > 1) {
- ret = ocfs2_extend_allocation(inode,
- OCFS2_I(inode)->ip_clusters,
- clusters_to_add - 1, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- while (total_zero_len) {
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters,
- &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) +
- size_div;
- zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) -
- size_div;
- zero_len = min(total_zero_len, zero_len);
-
- if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- zero_start >> 9, zero_len >> 9,
- GFP_NOFS, false);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- total_zero_len -= zero_len;
- v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div);
-
- /* Only at first iteration can be cluster not aligned.
- * So set size_div to 0 for the rest */
- size_div = 0;
- }
-
-out:
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- ssize_t ret = 0;
- ssize_t written = 0;
- bool orphaned = false;
- int is_overwrite = 0;
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- size_t count = iter->count;
- journal_t *journal = osb->journal->j_journal;
- u64 zero_len_head, zero_len_tail;
- int cluster_align_head, cluster_align_tail;
- loff_t final_size = offset + count;
- int append_write = offset >= i_size_read(inode) ? 1 : 0;
- unsigned int num_clusters = 0;
- unsigned int ext_flags = 0;
-
- {
- u64 o = offset;
- u64 s = i_size_read(inode);
-
- zero_len_head = do_div(o, 1 << osb->s_clustersize_bits);
- cluster_align_head = !zero_len_head;
-
- zero_len_tail = osb->s_clustersize -
- do_div(s, osb->s_clustersize);
- if ((offset - i_size_read(inode)) < zero_len_tail)
- zero_len_tail = offset - i_size_read(inode);
- cluster_align_tail = !zero_len_tail;
- }
-
- /*
- * when final_size > inode->i_size, inode->i_size will be
- * updated after direct write, so add the inode to orphan
- * dir first.
- */
- if (final_size > i_size_read(inode)) {
- ret = ocfs2_add_inode_to_orphan(osb, inode);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- orphaned = true;
- }
-
- if (append_write) {
- ret = ocfs2_inode_lock(inode, NULL, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- /* zeroing out the previously allocated cluster tail
- * that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
- zero_len_tail, cluster_align_tail);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- } else {
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
- offset);
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- }
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
- if (is_overwrite < 0) {
- mlog_errno(is_overwrite);
- ret = is_overwrite;
- ocfs2_inode_unlock(inode, 1);
- goto clean_orphan;
- }
-
- ocfs2_inode_unlock(inode, 1);
- }
-
- written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
- offset, ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
- if ((written < 0) && (written != -EIOCBQUEUED)) {
- loff_t i_size = i_size_read(inode);
-
- if (offset + count > i_size) {
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- if (i_size == i_size_read(inode)) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
- goto clean_orphan;
- }
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-
- ret = jbd2_journal_force_commit(journal);
- if (ret < 0)
- mlog_errno(ret);
- }
- } else if (written > 0 && append_write && !is_overwrite &&
- !cluster_align_head) {
- /* zeroing out the allocated cluster head */
- u32 p_cpos = 0;
- u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
-
- ret = ocfs2_inode_lock(inode, NULL, 0);
- if (ret < 0) {
- mlog_errno(ret);
- goto clean_orphan;
- }
-
- ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
- &num_clusters, &ext_flags);
- if (ret < 0) {
- mlog_errno(ret);
- ocfs2_inode_unlock(inode, 0);
- goto clean_orphan;
- }
-
- BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
-
- ret = blkdev_issue_zeroout(osb->sb->s_bdev,
- (u64)p_cpos << (osb->s_clustersize_bits - 9),
- zero_len_head >> 9, GFP_NOFS, false);
- if (ret < 0)
- mlog_errno(ret);
-
- ocfs2_inode_unlock(inode, 0);
- }
-
-clean_orphan:
- if (orphaned) {
- int tmp_ret;
- int update_isize = written > 0 ? 1 : 0;
- loff_t end = update_isize ? offset + written : 0;
-
- tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- goto out;
- }
-
- tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
- update_isize, end);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(ret);
- brelse(di_bh);
- goto out;
- }
-
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- tmp_ret = jbd2_journal_force_commit(journal);
- if (tmp_ret < 0) {
- ret = tmp_ret;
- mlog_errno(tmp_ret);
- }
- }
-
-out:
- if (ret >= 0)
- ret = written;
- return ret;
-}
-
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- loff_t offset)
-{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
-
- /*
- * Fallback to buffered I/O if we see an inode without
- * extents.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
- return 0;
-
- /* Fallback to buffered I/O if we are appending and
- * concurrent O_DIRECT writes are allowed.
- */
- if (i_size_read(inode) <= offset && !full_coherency)
- return 0;
-
- if (iov_iter_rw(iter) == READ)
- return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
- iter, offset,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
- else
- return ocfs2_direct_IO_write(iocb, iter, offset);
-}
-
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
u32 cpos,
unsigned int *start,
unsigned int *end)
{
- unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+ unsigned int cluster_start = 0, cluster_end = PAGE_SIZE;
- if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+ if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) {
unsigned int cpp;
- cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+ cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits);
cluster_start = cpos % cpp;
cluster_start = cluster_start << osb->s_clustersize_bits;
@@ -1187,13 +684,20 @@ next_bh:
return ret;
}
-#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES 1
#else
-#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE)
#endif
-#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
+struct ocfs2_unwritten_extent {
+ struct list_head ue_node;
+ struct list_head ue_ip_node;
+ u32 ue_cpos;
+ u32 ue_phys;
+};
/*
* Describe the state of a single cluster to be written to.
@@ -1206,7 +710,7 @@ struct ocfs2_write_cluster_desc {
* filled.
*/
unsigned c_new;
- unsigned c_unwritten;
+ unsigned c_clear_unwritten;
unsigned c_needs_zero;
};
@@ -1218,6 +722,9 @@ struct ocfs2_write_ctxt {
/* First cluster allocated in a nonsparse extend */
u32 w_first_new_cpos;
+ /* Type of caller. Must be one of buffer, mmap, direct. */
+ ocfs2_write_type_t w_type;
+
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
/*
@@ -1266,6 +773,8 @@ struct ocfs2_write_ctxt {
struct buffer_head *w_di_bh;
struct ocfs2_cached_dealloc_ctxt w_dealloc;
+
+ struct list_head w_unwritten_list;
};
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1276,7 +785,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
if (pages[i]) {
unlock_page(pages[i]);
mark_page_accessed(pages[i]);
- page_cache_release(pages[i]);
+ put_page(pages[i]);
}
}
}
@@ -1299,13 +808,30 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
}
}
mark_page_accessed(wc->w_target_page);
- page_cache_release(wc->w_target_page);
+ put_page(wc->w_target_page);
}
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
}
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+static void ocfs2_free_unwritten_list(struct inode *inode,
+ struct list_head *head)
{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
+
+ list_for_each_entry_safe(ue, tmp, head, ue_node) {
+ list_del(&ue->ue_node);
+ spin_lock(&oi->ip_lock);
+ list_del(&ue->ue_ip_node);
+ spin_unlock(&oi->ip_lock);
+ kfree(ue);
+ }
+}
+
+static void ocfs2_free_write_ctxt(struct inode *inode,
+ struct ocfs2_write_ctxt *wc)
+{
+ ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
ocfs2_unlock_pages(wc);
brelse(wc->w_di_bh);
kfree(wc);
@@ -1313,7 +839,8 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
struct ocfs2_super *osb, loff_t pos,
- unsigned len, struct buffer_head *di_bh)
+ unsigned len, ocfs2_write_type_t type,
+ struct buffer_head *di_bh)
{
u32 cend;
struct ocfs2_write_ctxt *wc;
@@ -1328,13 +855,15 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
wc->w_clen = cend - wc->w_cpos + 1;
get_bh(di_bh);
wc->w_di_bh = di_bh;
+ wc->w_type = type;
- if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+ if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits))
wc->w_large_pages = 1;
else
wc->w_large_pages = 0;
ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+ INIT_LIST_HEAD(&wc->w_unwritten_list);
*wcp = wc;
@@ -1391,16 +920,17 @@ static void ocfs2_write_failure(struct inode *inode,
loff_t user_pos, unsigned user_len)
{
int i;
- unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+ unsigned from = user_pos & (PAGE_SIZE - 1),
to = user_pos + user_len;
struct page *tmppage;
- ocfs2_zero_new_buffers(wc->w_target_page, from, to);
+ if (wc->w_target_page)
+ ocfs2_zero_new_buffers(wc->w_target_page, from, to);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (page_has_buffers(tmppage)) {
+ if (tmppage && page_has_buffers(tmppage)) {
if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
@@ -1430,7 +960,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
(page_offset(page) <= user_pos));
if (page == wc->w_target_page) {
- map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+ map_from = user_pos & (PAGE_SIZE - 1);
map_to = map_from + user_len;
if (new)
@@ -1504,7 +1034,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t last_byte;
- target_index = user_pos >> PAGE_CACHE_SHIFT;
+ target_index = user_pos >> PAGE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
@@ -1523,18 +1053,20 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
last_byte = max(user_pos + user_len, i_size_read(inode));
BUG_ON(last_byte < 1);
- end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+ end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1;
if ((start + wc->w_num_pages) > end_index)
wc->w_num_pages = end_index - start;
} else {
wc->w_num_pages = 1;
start = target_index;
}
+ end_index = (user_pos + user_len - 1) >> PAGE_SHIFT;
for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
- if (index == target_index && mmap_page) {
+ if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_MMAP) {
/*
* ocfs2_pagemkwrite() is a little different
* and wants us to directly use the page
@@ -1550,9 +1082,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
goto out;
}
- page_cache_get(mmap_page);
+ get_page(mmap_page);
wc->w_pages[i] = mmap_page;
wc->w_target_locked = true;
+ } else if (index >= target_index && index <= end_index &&
+ wc->w_type == OCFS2_WRITE_DIRECT) {
+ /* Direct write has no mapping page. */
+ wc->w_pages[i] = NULL;
+ continue;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1577,19 +1114,20 @@ out:
* Prepare a single cluster for write one cluster into the file.
*/
static int ocfs2_write_cluster(struct address_space *mapping,
- u32 phys, unsigned int unwritten,
+ u32 *phys, unsigned int new,
+ unsigned int clear_unwritten,
unsigned int should_zero,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_write_ctxt *wc, u32 cpos,
loff_t user_pos, unsigned user_len)
{
- int ret, i, new;
- u64 v_blkno, p_blkno;
+ int ret, i;
+ u64 p_blkno;
struct inode *inode = mapping->host;
struct ocfs2_extent_tree et;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- new = phys == 0 ? 1 : 0;
if (new) {
u32 tmp_pos;
@@ -1599,9 +1137,9 @@ static int ocfs2_write_cluster(struct address_space *mapping,
*/
tmp_pos = cpos;
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ &tmp_pos, 1, !clear_unwritten,
+ wc->w_di_bh, wc->w_handle,
+ data_ac, meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1618,11 +1156,11 @@ static int ocfs2_write_cluster(struct address_space *mapping,
mlog_errno(ret);
goto out;
}
- } else if (unwritten) {
+ } else if (clear_unwritten) {
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_mark_extent_written(inode, &et,
- wc->w_handle, cpos, 1, phys,
+ wc->w_handle, cpos, 1, *phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
mlog_errno(ret);
@@ -1630,30 +1168,33 @@ static int ocfs2_write_cluster(struct address_space *mapping,
}
}
- if (should_zero)
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
- else
- v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-
/*
* The only reason this should fail is due to an inability to
* find the extent added.
*/
- ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
- NULL);
+ ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
if (ret < 0) {
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
- "at logical block %llu",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (unsigned long long)v_blkno);
+ "at logical cluster %u",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
goto out;
}
- BUG_ON(p_blkno == 0);
+ BUG_ON(*phys == 0);
+
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
+ if (!should_zero)
+ p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
for(i = 0; i < wc->w_num_pages; i++) {
int tmpret;
+ /* This is the direct io target page. */
+ if (wc->w_pages[i] == NULL) {
+ p_blkno++;
+ continue;
+ }
+
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
wc->w_pages[i], cpos,
user_pos, user_len,
@@ -1700,8 +1241,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
if ((cluster_off + local_len) > osb->s_clustersize)
local_len = osb->s_clustersize - cluster_off;
- ret = ocfs2_write_cluster(mapping, desc->c_phys,
- desc->c_unwritten,
+ ret = ocfs2_write_cluster(mapping, &desc->c_phys,
+ desc->c_new,
+ desc->c_clear_unwritten,
desc->c_needs_zero,
data_ac, meta_ac,
wc, desc->c_cpos, pos, local_len);
@@ -1730,7 +1272,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
{
struct ocfs2_write_cluster_desc *desc;
- wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+ wc->w_target_from = pos & (PAGE_SIZE - 1);
wc->w_target_to = wc->w_target_from + len;
if (alloc == 0)
@@ -1767,8 +1309,68 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
&wc->w_target_to);
} else {
wc->w_target_from = 0;
- wc->w_target_to = PAGE_CACHE_SIZE;
+ wc->w_target_to = PAGE_SIZE;
+ }
+}
+
+/*
+ * Reconcile a cluster that needs zeroing with the inode's list of extents
+ * that direct io has left UNWRITTEN (oi->ip_unwritten_list).
+ *
+ * If the cluster is already on that list, the writer that put it there is
+ * doing the zeroing and will clear UNWRITTEN once its cluster io has finished,
+ * so this write must neither zero the cluster nor clear the flag.
+ * Otherwise, if this is a direct io write, the cluster is recorded on both
+ * the inode's list and wc->w_unwritten_list, and clearing UNWRITTEN is left
+ * to the direct io procedure.
+ *
+ * @inode: inode being written
+ * @wc:    write context; w_type selects the direct io behaviour and
+ *         w_unwritten_list receives any newly recorded extent
+ * @desc:  cluster descriptor; c_needs_zero and c_clear_unwritten may be
+ *         cleared here
+ *
+ * Returns 0 on success (including when there was nothing to do) and -ENOMEM
+ * if a new list entry could not be allocated.
+ */
+static int ocfs2_unwritten_check(struct inode *inode,
+				 struct ocfs2_write_ctxt *wc,
+				 struct ocfs2_write_cluster_desc *desc)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
+	int ret = 0;
+
+	if (!desc->c_needs_zero)
+		return 0;
+
+retry:
+	spin_lock(&oi->ip_lock);
+	/*
+	 * No zeroing is needed here, whether this write is buffered or
+	 * direct: the writer that listed the cluster is zeroing it and will
+	 * clear UNWRITTEN after all of its cluster io has finished.
+	 */
+	list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
+		if (desc->c_cpos == ue->ue_cpos) {
+			BUG_ON(desc->c_new);
+			desc->c_needs_zero = 0;
+			desc->c_clear_unwritten = 0;
+			goto unlock;
+		}
 	}
+
+	if (wc->w_type != OCFS2_WRITE_DIRECT)
+		goto unlock;
+
+	if (new == NULL) {
+		/*
+		 * The allocation is done with ip_lock dropped, so the list
+		 * has to be searched again once the lock is retaken.
+		 */
+		spin_unlock(&oi->ip_lock);
+		new = kmalloc(sizeof(*new), GFP_NOFS);
+		if (new == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		goto retry;
+	}
+	/* This direct write will do the zeroing. */
+	new->ue_cpos = desc->c_cpos;
+	new->ue_phys = desc->c_phys;
+	desc->c_clear_unwritten = 0;
+	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
+	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+	new = NULL;
+unlock:
+	spin_unlock(&oi->ip_lock);
+out:
+	/* kfree(NULL) is a no-op; frees an entry that ended up unused. */
+	kfree(new);
+	return ret;
 }
/*
@@ -1846,14 +1448,21 @@ static int ocfs2_populate_write_desc(struct inode *inode,
if (phys == 0) {
desc->c_new = 1;
desc->c_needs_zero = 1;
+ desc->c_clear_unwritten = 1;
*clusters_to_alloc = *clusters_to_alloc + 1;
}
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
- desc->c_unwritten = 1;
+ desc->c_clear_unwritten = 1;
desc->c_needs_zero = 1;
}
+ ret = ocfs2_unwritten_check(inode, wc, desc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
num_clusters--;
}
@@ -2016,8 +1625,10 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode,
if (ret)
mlog_errno(ret);
- wc->w_first_new_cpos =
- ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
+ /* There is no wc if this is called from the direct io path. */
+ if (wc)
+ wc->w_first_new_cpos =
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
return ret;
}
@@ -2046,9 +1657,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
int ret = 0;
unsigned int truncated_clusters;
- mutex_lock(&osb->osb_tl_inode->i_mutex);
+ inode_lock(osb->osb_tl_inode);
truncated_clusters = osb->truncated_clusters;
- mutex_unlock(&osb->osb_tl_inode->i_mutex);
+ inode_unlock(osb->osb_tl_inode);
/*
* Check whether we can succeed in allocating if we free
@@ -2071,9 +1682,8 @@ out:
return ret;
}
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
{
@@ -2090,7 +1700,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
int try_free = 1, ret1;
try_again:
- ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
if (ret) {
mlog_errno(ret);
return ret;
@@ -2109,14 +1719,17 @@ try_again:
}
}
- if (ocfs2_sparse_alloc(osb))
- ret = ocfs2_zero_tail(inode, di_bh, pos);
- else
- ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
- wc);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /* Direct io changes i_size later, so do not zero the tail here. */
+ if (type != OCFS2_WRITE_DIRECT) {
+ if (ocfs2_sparse_alloc(osb))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ len, wc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
ret = ocfs2_check_range_for_refcount(inode, pos, len);
@@ -2147,7 +1760,7 @@ try_again:
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(long long)i_size_read(inode),
le32_to_cpu(di->i_clusters),
- pos, len, flags, mmap_page,
+ pos, len, type, mmap_page,
clusters_to_alloc, extents_to_split);
/*
@@ -2177,17 +1790,17 @@ try_again:
credits = ocfs2_calc_extend_credits(inode->i_sb,
&di->id2.i_list);
-
- }
+ } else if (type == OCFS2_WRITE_DIRECT)
+ /* A direct write need not start a transaction if no extents need to be allocated. */
+ goto success;
/*
* We have to zero sparse allocated clusters, unwritten extent clusters,
* and non-sparse clusters we just extended. For non-sparse writes,
* we know zeros will only be needed in the first and/or last cluster.
*/
- if (clusters_to_alloc || extents_to_split ||
- (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
- wc->w_desc[wc->w_clen - 1].c_needs_zero)))
+ if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
+ wc->w_desc[wc->w_clen - 1].c_needs_zero))
cluster_of_pages = 1;
else
cluster_of_pages = 0;
@@ -2254,7 +1867,8 @@ try_again:
ocfs2_free_alloc_context(meta_ac);
success:
- *pagep = wc->w_target_page;
+ if (pagep)
+ *pagep = wc->w_target_page;
*fsdata = wc;
return 0;
out_quota:
@@ -2265,7 +1879,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out:
- ocfs2_free_write_ctxt(wc);
+ ocfs2_free_write_ctxt(inode, wc);
if (data_ac) {
ocfs2_free_alloc_context(data_ac);
@@ -2317,8 +1931,8 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
- fsdata, di_bh, NULL);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
+ pagep, fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
goto out_fail;
@@ -2367,7 +1981,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
struct page *page, void *fsdata)
{
int i, ret;
- unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned from, to, start = pos & (PAGE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_write_ctxt *wc = fsdata;
@@ -2375,12 +1989,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- copied = ret;
- mlog_errno(ret);
- goto out;
+ BUG_ON(!list_empty(&wc->w_unwritten_list));
+
+ if (handle) {
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+ wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
@@ -2388,24 +2006,29 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
goto out_write_size;
}
- if (unlikely(copied < len)) {
+ if (unlikely(copied < len) && wc->w_target_page) {
if (!PageUptodate(wc->w_target_page))
copied = 0;
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
start+len);
}
- flush_dcache_page(wc->w_target_page);
+ if (wc->w_target_page)
+ flush_dcache_page(wc->w_target_page);
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
+ /* This is the direct io target page. */
+ if (tmppage == NULL)
+ continue;
+
if (tmppage == wc->w_target_page) {
from = wc->w_target_from;
to = wc->w_target_to;
- BUG_ON(from > PAGE_CACHE_SIZE ||
- to > PAGE_CACHE_SIZE ||
+ BUG_ON(from > PAGE_SIZE ||
+ to > PAGE_SIZE ||
to < from);
} else {
/*
@@ -2414,29 +2037,33 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
* to flush their entire range.
*/
from = 0;
- to = PAGE_CACHE_SIZE;
+ to = PAGE_SIZE;
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode))
- ocfs2_jbd2_file_inode(wc->w_handle, inode);
+ if (handle && ocfs2_should_order_data(inode))
+ ocfs2_jbd2_file_inode(handle, inode);
block_commit_write(tmppage, from, to);
}
}
out_write_size:
- pos += copied;
- if (pos > i_size_read(inode)) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- inode->i_blocks = ocfs2_inode_sector_count(inode);
- di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
- di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
- ocfs2_journal_dirty(handle, wc->w_di_bh);
+ /* Direct io does not update i_size here. */
+ if (wc->w_type != OCFS2_WRITE_DIRECT) {
+ pos += copied;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ mark_inode_dirty(inode);
+ }
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ }
+ if (handle)
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
@@ -2446,7 +2073,8 @@ out:
*/
ocfs2_unlock_pages(wc);
- ocfs2_commit_trans(osb, handle);
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
ocfs2_run_deallocs(osb, &wc->w_dealloc);
@@ -2471,6 +2099,360 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
return ret;
}
+/*
+ * State carried across one direct io write. It is allocated by
+ * ocfs2_dio_alloc_write_ctx(), kept in the mapping buffer_head's b_private,
+ * and consumed and freed by ocfs2_dio_end_io_write().
+ */
+struct ocfs2_dio_write_ctxt {
+ struct list_head dw_zero_list; /* unwritten extents collected by ocfs2_dio_get_block(); each is passed to ocfs2_mark_extent_written() at end_io */
+ unsigned dw_zero_count; /* bumped once per get_block call that added entries to dw_zero_list; sizes the allocator reservation at end_io */
+ int dw_orphaned; /* set to 1 once the inode has been added to the orphan dir for this write */
+ pid_t dw_writer_pid; /* task_pid_nr() of the task that allocated the context */
+};
+
+/*
+ * Return the direct io write context attached to @bh, creating and attaching
+ * a fresh one if @bh has none yet.
+ *
+ * @bh:    buffer_head whose b_private holds the context
+ * @alloc: set to 1 only when a new context was allocated by this call; left
+ *         untouched when an existing context is reused
+ *
+ * Returns the context, or NULL if the allocation failed.
+ */
+static struct ocfs2_dio_write_ctxt *
+ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
+{
+	struct ocfs2_dio_write_ctxt *ctx = bh->b_private;
+
+	/* An earlier call already attached a context: reuse it. */
+	if (ctx)
+		return ctx;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_NOFS);
+	if (!ctx)
+		return NULL;
+
+	/* Start with an empty extent list, not orphaned, owned by current. */
+	ctx->dw_writer_pid = task_pid_nr(current);
+	ctx->dw_orphaned = 0;
+	ctx->dw_zero_count = 0;
+	INIT_LIST_HEAD(&ctx->dw_zero_list);
+
+	*alloc = 1;
+	bh->b_private = ctx;
+
+	return ctx;
+}
+
+/*
+ * Release a direct io write context: hand whatever is still on
+ * dwc->dw_zero_list to ocfs2_free_unwritten_list(), then free @dwc itself.
+ * @dwc must not be used after this returns.
+ */
+static void ocfs2_dio_free_write_ctx(struct inode *inode,
+ struct ocfs2_dio_write_ctxt *dwc)
+{
+ ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
+ kfree(dwc);
+}
+
+/*
+ * get_block callback used for direct io writes (see ocfs2_direct_IO()).
+ *
+ * TODO: Make this into a generic get_blocks function.
+ *
+ * From do_direct_io in direct-io.c:
+ * "So what we do is to permit the ->get_blocks function to populate
+ * bh.b_size with the size of IO which is permitted at this offset and
+ * this i_blkbits."
+ *
+ * This function is called directly from get_more_blocks in direct-io.c.
+ *
+ * Called like this: dio->get_blocks(dio->inode, fs_startblk,
+ * fs_count, map_bh, dio->rw == WRITE);
+ *
+ * On entry bh_result->b_size is the size requested by the caller. On success
+ * bh_result is mapped and, on the slow path, b_size is trimmed so the mapping
+ * never crosses a cluster boundary.
+ *
+ * Returns 0 on success. Every failure is reported to the caller as -EIO,
+ * whatever the underlying error was.
+ */
+static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_write_ctxt *wc;
+ struct ocfs2_write_cluster_desc *desc = NULL;
+ struct ocfs2_dio_write_ctxt *dwc = NULL;
+ struct buffer_head *di_bh = NULL;
+ u64 p_blkno;
+ loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned len, total_len = bh_result->b_size;
+ int ret = 0, first_get_block = 0;
+
+ /* Clip the request to the end of the cluster that contains pos. */
+ len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
+ len = min(total_len, len);
+
+ mlog(0, "get block of %lu at %llu:%u req %u\n",
+ inode->i_ino, pos, len, total_len);
+
+ /*
+ * The fast path may only be used when the file size will not change.
+ * A write that extends the file has its size updated in
+ * ocfs2_dio_end_io_write() and may need the inode added to the orphan
+ * dir, so it must take the slow path below.
+ */
+ if (pos + total_len <= i_size_read(inode)) {
+ down_read(&oi->ip_alloc_sem);
+ /* This is the fast path for re-write. */
+ ret = ocfs2_get_block(inode, iblock, bh_result, create);
+
+ up_read(&oi->ip_alloc_sem);
+
+ if (buffer_mapped(bh_result) &&
+ !buffer_new(bh_result) &&
+ ret == 0)
+ goto out;
+
+ /* Clear state set by ocfs2_get_block. */
+ bh_result->b_state = 0;
+ }
+
+ /* Slow path: find or create the per-write context kept in b_private. */
+ dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
+ if (unlikely(dwc == NULL)) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
+ ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
+ !dwc->dw_orphaned) {
+ /*
+ * When we are going to allocate extents beyond the file size,
+ * add the inode to the orphan dir so that the space can be
+ * reclaimed if the system crashes during the write.
+ */
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ dwc->dw_orphaned = 1;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_write(&oi->ip_alloc_sem);
+
+ /* Only the call that created the context zeroes the tail or expands
+ * the inode; later calls for the same write skip this. */
+ if (first_get_block) {
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_tail(inode, di_bh, pos);
+ else
+ ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
+ total_len, NULL);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+ }
+
+ /* No page is requested (pagep is NULL); the write context comes back
+ * through the fsdata argument. */
+ ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
+ OCFS2_WRITE_DIRECT, NULL,
+ (void **)&wc, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock;
+ }
+
+ /* len never crosses a cluster boundary, so only the first cluster
+ * descriptor is consulted. */
+ desc = &wc->w_desc[0];
+
+ /* First block of the physical cluster, plus the offset of iblock
+ * within that cluster. */
+ p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
+ BUG_ON(p_blkno == 0);
+ p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
+
+ map_bh(bh_result, inode->i_sb, p_blkno);
+ bh_result->b_size = len;
+ if (desc->c_needs_zero)
+ set_buffer_new(bh_result);
+
+ /* end_io may sleep, which must not happen in irq context, so defer
+ * it to the dio work queue. */
+ set_buffer_defer_completion(bh_result);
+
+ if (!list_empty(&wc->w_unwritten_list)) {
+ struct ocfs2_unwritten_extent *ue = NULL;
+
+ ue = list_first_entry(&wc->w_unwritten_list,
+ struct ocfs2_unwritten_extent,
+ ue_node);
+ BUG_ON(ue->ue_cpos != desc->c_cpos);
+ /* The physical address may be 0, fill it. */
+ ue->ue_phys = desc->c_phys;
+
+ /* Hand the extents over to the dio context; they are marked
+ * written in ocfs2_dio_end_io_write(). */
+ list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
+ dwc->dw_zero_count++;
+ }
+
+ ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc);
+ BUG_ON(ret != len);
+ ret = 0;
+unlock:
+ up_write(&oi->ip_alloc_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+out:
+ /* Collapse every failure into -EIO for the caller. */
+ if (ret < 0)
+ ret = -EIO;
+ return ret;
+}
+
+/*
+ * Finish a direct io write: remove the inode from the orphan dir if this
+ * write put it there, mark every extent on dwc->dw_zero_list as written and
+ * extend i_size if the write ended beyond it. @dwc is always freed before
+ * returning, including on the early-exit and error paths.
+ *
+ * @inode:  inode that was written
+ * @dwc:    write context built up by ocfs2_dio_get_block()
+ * @offset: file offset at which the io started
+ * @bytes:  number of bytes transferred
+ *
+ * Errors are logged with mlog_errno(); nothing is returned to the caller.
+ */
+static void ocfs2_dio_end_io_write(struct inode *inode,
+				   struct ocfs2_dio_write_ctxt *dwc,
+				   loff_t offset,
+				   ssize_t bytes)
+{
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_unwritten_extent *ue = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = NULL;
+	loff_t end = offset + bytes;
+	int ret = 0, credits = 0, locked = 0;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	/*
+	 * This is where unwritten extents are cleared, the orphan entry is
+	 * deleted and i_size is changed. If none of these is needed, all of
+	 * the work below can be skipped.
+	 */
+	if (list_empty(&dwc->dw_zero_list) &&
+	    end <= i_size_read(inode) &&
+	    !dwc->dw_orphaned)
+		goto out;
+
+	/* ocfs2_file_write_iter will get i_mutex, so we need not lock if we
+	 * are in that context. */
+	if (dwc->dw_writer_pid != task_pid_nr(current)) {
+		mutex_lock(&inode->i_mutex);
+		locked = 1;
+	}
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_write(&oi->ip_alloc_sem);
+
+	/*
+	 * Delete the orphan entry. This is only expected in the context of
+	 * the task that submitted the write, hence the BUG_ON below.
+	 */
+	if (dwc->dw_orphaned) {
+		BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
+
+		/* end == 0 means the write did not extend the file. */
+		end = end > i_size_read(inode) ? end : 0;
+
+		ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
+				!!end, end);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/*
+	 * The on-disk inode lives in the buffer's data, not in the
+	 * buffer_head structure itself.
+	 */
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
+				    &data_ac, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto unlock;
+	}
+
+	credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto unlock;
+	}
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto commit;
+	}
+
+	/* Mark each extent collected during get_block as written; stop at
+	 * the first failure. */
+	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
+		ret = ocfs2_mark_extent_written(inode, &et, handle,
+						ue->ue_cpos, 1,
+						ue->ue_phys,
+						meta_ac, &dealloc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	if (end > i_size_read(inode)) {
+		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+commit:
+	ocfs2_commit_trans(osb, handle);
+unlock:
+	up_write(&oi->ip_alloc_sem);
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	ocfs2_run_deallocs(osb, &dealloc);
+	if (locked)
+		mutex_unlock(&inode->i_mutex);
+	ocfs2_dio_free_write_ctx(inode, dwc);
+}
+
+/*
+ * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
+ *
+ * @iocb:    the kiocb the io was submitted with
+ * @offset:  file offset at which the io started
+ * @bytes:   bytes transferred; <= 0 means nothing was completed
+ * @private: when non-NULL, the struct ocfs2_dio_write_ctxt of a write
+ *
+ * When @bytes <= 0 the rw lock and the iocb's rw-locked state are left
+ * untouched, but a write context passed in @private is still released so
+ * that neither it nor the unwritten extents it tracks are leaked.
+ *
+ * Always returns 0.
+ */
+static int ocfs2_dio_end_io(struct kiocb *iocb,
+			    loff_t offset,
+			    ssize_t bytes,
+			    void *private)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	int level;
+
+	if (bytes <= 0) {
+		/*
+		 * ocfs2_dio_end_io_write() is the only other place that
+		 * frees the context, and it is not called on this path.
+		 */
+		if (private)
+			ocfs2_dio_free_write_ctx(inode, private);
+		return 0;
+	}
+
+	/* this io's submitter should not have unlocked this before we could */
+	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
+	if (private)
+		ocfs2_dio_end_io_write(inode, private, offset, bytes);
+
+	ocfs2_iocb_clear_rw_locked(iocb);
+
+	level = ocfs2_iocb_rw_locked_level(iocb);
+	ocfs2_rw_unlock(inode, level);
+	return 0;
+}
+
+/*
+ * Direct io entry point: hand the request to __blockdev_direct_IO() with the
+ * block-mapping callback that matches the transfer direction.
+ *
+ * Returns 0, without submitting anything, when the request has to fall back
+ * to buffered io; otherwise returns whatever __blockdev_direct_IO() returns.
+ */
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+			       loff_t offset)
+{
+	struct inode *inode = file_inode(iocb->ki_filp)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t io_end = offset + iter->count;
+	get_block_t *mapper;
+
+	/* An inode with inline data has no extents: use buffered io. */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	/* Going past i_size needs append dio support; else use buffered io. */
+	if (io_end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+		return 0;
+
+	/* Reads only look blocks up; writes go through the dio mapper. */
+	mapper = (iov_iter_rw(iter) == READ) ? ocfs2_get_block
+					     : ocfs2_dio_get_block;
+
+	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
+				    iter, offset, mapper,
+				    ocfs2_dio_end_io, NULL, 0);
+}
+
const struct address_space_operations ocfs2_aops = {
.readpage = ocfs2_readpage,
.readpages = ocfs2_readpages,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 24e496d6bdcd..b1c9f28a57b1 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -47,9 +47,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct file *filp,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+/*
+ * Kind of write being prepared; passed as the type argument of
+ * ocfs2_write_begin_nolock().
+ */
+typedef enum {
+ OCFS2_WRITE_BUFFER = 0, /* buffered write through ->write_begin */
+ OCFS2_WRITE_DIRECT, /* direct io write, from the dio get_block callback */
+ OCFS2_WRITE_MMAP, /* presumably a write through a memory mapping - TODO confirm against callers */
+} ocfs2_write_type_t;
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
@@ -79,7 +84,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
enum ocfs2_iocb_lock_bits {
OCFS2_IOCB_RW_LOCK = 0,
OCFS2_IOCB_RW_LOCK_LEVEL,
- OCFS2_IOCB_UNALIGNED_IO,
OCFS2_IOCB_NUM_LOCKS
};
@@ -88,11 +92,4 @@ enum ocfs2_iocb_lock_bits {
#define ocfs2_iocb_rw_locked_level(iocb) \
test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_unaligned_aio(iocb) \
- set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_clear_unaligned_aio(iocb) \
- clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_is_unaligned_aio(iocb) \
- test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ddddef0021a0..1934abb6b680 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -287,7 +287,6 @@ struct o2hb_bio_wait_ctxt {
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
- unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
@@ -297,14 +296,14 @@ static void o2hb_write_timeout(struct work_struct *work)
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
if (o2hb_global_heartbeat_active()) {
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
quorum, failed);
@@ -418,13 +417,13 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
- vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+ vec_start = (cs << bits) % PAGE_SIZE;
while(cs < max_slots) {
current_page = cs / spp;
page = reg->hr_slot_data[current_page];
- vec_len = min(PAGE_CACHE_SIZE - vec_start,
- (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
+ vec_len = min(PAGE_SIZE - vec_start,
+ (max_slots-cs) * (PAGE_SIZE/spp) );
mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
current_page, vec_len, vec_start);
@@ -432,7 +431,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
len = bio_add_page(bio, page, vec_len, vec_start);
if (len != vec_len) break;
- cs += vec_len / (PAGE_CACHE_SIZE/spp);
+ cs += vec_len / (PAGE_SIZE/spp);
vec_start = 0;
}
@@ -1254,15 +1253,15 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
- kfree(o2hb_db_livenodes);
- kfree(o2hb_db_liveregions);
- kfree(o2hb_db_quorumregions);
- kfree(o2hb_db_failedregions);
debugfs_remove(o2hb_debug_failedregions);
debugfs_remove(o2hb_debug_quorumregions);
debugfs_remove(o2hb_debug_liveregions);
debugfs_remove(o2hb_debug_livenodes);
debugfs_remove(o2hb_debug_dir);
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
}
static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
@@ -1438,13 +1437,15 @@ static void o2hb_region_release(struct config_item *item)
kfree(reg->hr_slots);
- kfree(reg->hr_db_regnum);
- kfree(reg->hr_db_livenodes);
debugfs_remove(reg->hr_debug_livenodes);
debugfs_remove(reg->hr_debug_regnum);
debugfs_remove(reg->hr_debug_elapsed_time);
debugfs_remove(reg->hr_debug_pinned);
debugfs_remove(reg->hr_debug_dir);
+ kfree(reg->hr_db_livenodes);
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_db_elapsed_time);
+ kfree(reg->hr_db_pinned);
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
@@ -1480,16 +1481,17 @@ static int o2hb_read_block_input(struct o2hb_region *reg,
return 0;
}
-static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%u\n", reg->hr_block_bytes);
+ return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes);
}
-static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
int status;
unsigned long block_bytes;
unsigned int block_bits;
@@ -1508,16 +1510,17 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_show(struct config_item *item,
char *page)
{
- return sprintf(page, "%llu\n", reg->hr_start_block);
+ return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block);
}
-static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_start_block_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long long tmp;
char *p = (char *)page;
@@ -1533,16 +1536,16 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", reg->hr_blocks);
+ return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks);
}
-static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_blocks_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
unsigned long tmp;
char *p = (char *)page;
@@ -1561,20 +1564,19 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
return count;
}
-static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
{
unsigned int ret = 0;
- if (reg->hr_bdev)
- ret = sprintf(page, "%s\n", reg->hr_dev_name);
+ if (to_o2hb_region(item)->hr_bdev)
+ ret = sprintf(page, "%s\n", to_o2hb_region(item)->hr_dev_name);
return ret;
}
static void o2hb_init_region_params(struct o2hb_region *reg)
{
- reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+ reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits;
reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
@@ -1677,10 +1679,11 @@ out:
}
/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
-static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
+static ssize_t o2hb_region_dev_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
struct task_struct *hb_task;
long fd;
int sectsize;
@@ -1778,8 +1781,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
}
++live_threshold;
atomic_set(&reg->hr_steady_iterations, live_threshold);
- /* unsteady_iterations is double the steady_iterations */
- atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+ /* unsteady_iterations is triple the steady_iterations */
+ atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
reg->hr_item.ci_name);
@@ -1841,9 +1844,9 @@ out:
return ret;
}
-static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
- char *page)
+static ssize_t o2hb_region_pid_show(struct config_item *item, char *page)
{
+ struct o2hb_region *reg = to_o2hb_region(item);
pid_t pid = 0;
spin_lock(&o2hb_live_lock);
@@ -1857,92 +1860,23 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg,
return sprintf(page, "%u\n", pid);
}
-struct o2hb_region_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_region *, char *);
- ssize_t (*store)(struct o2hb_region *, const char *, size_t);
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "block_bytes",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_block_bytes_read,
- .store = o2hb_region_block_bytes_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_start_block = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "start_block",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_start_block_read,
- .store = o2hb_region_start_block_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_blocks = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "blocks",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_blocks_read,
- .store = o2hb_region_blocks_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_dev = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dev",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_region_dev_read,
- .store = o2hb_region_dev_write,
-};
-
-static struct o2hb_region_attribute o2hb_region_attr_pid = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "pid",
- .ca_mode = S_IRUGO | S_IRUSR },
- .show = o2hb_region_pid_read,
-};
+CONFIGFS_ATTR(o2hb_region_, block_bytes);
+CONFIGFS_ATTR(o2hb_region_, start_block);
+CONFIGFS_ATTR(o2hb_region_, blocks);
+CONFIGFS_ATTR(o2hb_region_, dev);
+CONFIGFS_ATTR_RO(o2hb_region_, pid);
static struct configfs_attribute *o2hb_region_attrs[] = {
- &o2hb_region_attr_block_bytes.attr,
- &o2hb_region_attr_start_block.attr,
- &o2hb_region_attr_blocks.attr,
- &o2hb_region_attr_dev.attr,
- &o2hb_region_attr_pid.attr,
+ &o2hb_region_attr_block_bytes,
+ &o2hb_region_attr_start_block,
+ &o2hb_region_attr_blocks,
+ &o2hb_region_attr_dev,
+ &o2hb_region_attr_pid,
NULL,
};
-static ssize_t o2hb_region_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_region_attr->show)
- ret = o2hb_region_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_region_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_region *reg = to_o2hb_region(item);
- struct o2hb_region_attribute *o2hb_region_attr =
- container_of(attr, struct o2hb_region_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_region_attr->store)
- ret = o2hb_region_attr->store(reg, page, count);
- return ret;
-}
-
static struct configfs_item_operations o2hb_region_item_ops = {
.release = o2hb_region_release,
- .show_attribute = o2hb_region_show,
- .store_attribute = o2hb_region_store,
};
static struct config_item_type o2hb_region_type = {
@@ -2137,49 +2071,14 @@ unlock:
spin_unlock(&o2hb_live_lock);
}
-struct o2hb_heartbeat_group_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
- ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
-};
-
-static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = 0;
-
- if (o2hb_heartbeat_group_attr->show)
- ret = o2hb_heartbeat_group_attr->show(reg, page);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
- struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
- container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
- ssize_t ret = -EINVAL;
-
- if (o2hb_heartbeat_group_attr->store)
- ret = o2hb_heartbeat_group_attr->store(reg, page, count);
- return ret;
-}
-
-static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
- char *page)
+/*
+ * Show handler for the heartbeat group's "dead_threshold" attribute:
+ * prints o2hb_dead_threshold as "%u\n" into @page and returns sprintf()'s
+ * result. @item is not used.
+ */
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%u\n", o2hb_dead_threshold);
}
-static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
- const char *page,
- size_t count)
+/*
+ * Store handler for the heartbeat group's "dead_threshold" attribute.
+ * Returns @count on the success path visible here.
+ */
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned long tmp;
char *p = (char *)page;
@@ -2194,17 +2093,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
-static
-ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
- char *page)
+/*
+ * Show handler for the heartbeat group's "mode" attribute: prints
+ * o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode] followed by a newline.
+ */
+static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item,
+ char *page)
{
return sprintf(page, "%s\n",
o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]);
}
-static
-ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
- const char *page, size_t count)
+static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
+ const char *page, size_t count)
{
unsigned int i;
int ret;
@@ -2229,33 +2126,15 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
}
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "dead_threshold",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_threshold_show,
- .store = o2hb_heartbeat_group_threshold_store,
-};
-
-static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "mode",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2hb_heartbeat_group_mode_show,
- .store = o2hb_heartbeat_group_mode_store,
-};
+/*
+ * CONFIGFS_ATTR() takes the attribute's file name from its second argument,
+ * so that argument must stay "dead_threshold" -- the .ca_name used by the
+ * removed o2hb_heartbeat_group_attr_threshold -- rather than "threshold";
+ * otherwise the user-visible configfs file would be renamed. The handlers
+ * above are named to match.
+ */
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
+/* NULL-terminated attribute table of the heartbeat group. */
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
- &o2hb_heartbeat_group_attr_threshold.attr,
- &o2hb_heartbeat_group_attr_mode.attr,
+ &o2hb_heartbeat_group_attr_dead_threshold,
+ &o2hb_heartbeat_group_attr_mode,
NULL,
};
-static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
- .show_attribute = o2hb_heartbeat_group_show,
- .store_attribute = o2hb_heartbeat_group_store,
-};
-
static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
.make_item = o2hb_heartbeat_group_make_item,
.drop_item = o2hb_heartbeat_group_drop_item,
@@ -2263,7 +2142,6 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
+/*
+ * Item type of the heartbeat group: it supplies group operations and the
+ * attribute table, and no longer sets ct_item_ops.
+ */
static struct config_item_type o2hb_heartbeat_group_type = {
.ct_group_ops = &o2hb_heartbeat_group_group_ops,
- .ct_item_ops = &o2hb_heartbeat_group_item_ops,
.ct_attrs = o2hb_heartbeat_group_attrs,
.ct_owner = THIS_MODULE,
};
@@ -2546,11 +2424,10 @@ EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
- unsigned long flags;
- spin_lock_irqsave(&o2hb_live_lock, flags);
+ spin_lock(&o2hb_live_lock);
o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
- spin_unlock_irqrestore(&o2hb_live_lock, flags);
+ spin_unlock(&o2hb_live_lock);
if (!test_bit(node_num, testing_map)) {
mlog(ML_HEARTBEAT,
"node (%u) does not have heartbeating enabled.\n",
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 441c84e169e6..b17d180bdc16 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -172,9 +172,9 @@ static void o2nm_node_release(struct config_item *item)
kfree(node);
}
-static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
+/*
+ * Show handler for the node "num" attribute: resolves the o2nm_node from
+ * @item with to_o2nm_node(), prints its nd_num as "%d\n" into @page and
+ * returns sprintf()'s result.
+ */
+static ssize_t o2nm_node_num_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_num);
+	struct o2nm_node *node = to_o2nm_node(item);
+
+	return sprintf(page, "%d\n", node->nd_num);
}
static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
@@ -188,15 +188,16 @@ enum {
O2NM_NODE_ATTR_NUM = 0,
O2NM_NODE_ATTR_PORT,
O2NM_NODE_ATTR_ADDRESS,
- O2NM_NODE_ATTR_LOCAL,
};
-static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_num_store(struct config_item *item, const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
+ int ret = 0;
tmp = simple_strtoul(p, &p, 0);
if (!p || (*p && (*p != '\n')))
@@ -215,26 +216,30 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
write_lock(&cluster->cl_nodes_lock);
if (cluster->cl_nodes[tmp])
- p = NULL;
+ ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_NUM,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
cluster->cl_nodes[tmp] = node;
node->nd_num = tmp;
set_bit(tmp, cluster->cl_nodes_bitmap);
}
write_unlock(&cluster->cl_nodes_lock);
- if (p == NULL)
- return -EEXIST;
+ if (ret)
+ return ret;
return count;
}
-static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
+/*
+ * Show handler for the node "ipv4_port" attribute: prints the node's
+ * nd_ipv4_port, converted with ntohs(), as "%u\n" into @page and returns
+ * sprintf()'s result.
+ */
+static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
+	struct o2nm_node *node = to_o2nm_node(item);
+
+	return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
}
-static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_port_store(struct config_item *item,
const char *page, size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
unsigned long tmp;
char *p = (char *)page;
@@ -247,20 +252,23 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
if (tmp >= (u16)-1)
return -ERANGE;
+ if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
+ return -EBUSY;
node->nd_ipv4_port = htons(tmp);
return count;
}
-static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
+/*
+ * Show handler for the node "ipv4_address" attribute: prints the node's
+ * nd_ipv4_address with the "%pI4" format plus a newline into @page and
+ * returns sprintf()'s result.
+ */
+static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page)
{
- return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
+	struct o2nm_node *node = to_o2nm_node(item);
+
+	return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
}
-static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
+static ssize_t o2nm_node_ipv4_address_store(struct config_item *item,
const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
int ret, i;
struct rb_node **p, *parent;
@@ -282,6 +290,9 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
write_lock(&cluster->cl_nodes_lock);
if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
ret = -EEXIST;
+ else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS,
+ &node->nd_set_attributes))
+ ret = -EBUSY;
else {
rb_link_node(&node->nd_ip_node, parent, p);
rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
@@ -295,14 +306,15 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
return count;
}
-static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
+/*
+ * Show handler for the node "local" attribute: prints the node's nd_local
+ * as "%d\n" into @page and returns sprintf()'s result.
+ */
+static ssize_t o2nm_node_local_show(struct config_item *item, char *page)
{
- return sprintf(page, "%d\n", node->nd_local);
+	struct o2nm_node *node = to_o2nm_node(item);
+
+	return sprintf(page, "%d\n", node->nd_local);
}
-static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
+static ssize_t o2nm_node_local_store(struct config_item *item, const char *page,
size_t count)
{
+ struct o2nm_node *node = to_o2nm_node(item);
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
unsigned long tmp;
char *p = (char *)page;
@@ -349,108 +361,21 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
return count;
}
-struct o2nm_node_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_node *, char *);
- ssize_t (*store)(struct o2nm_node *, const char *, size_t);
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_num = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "num",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_num_read,
- .store = o2nm_node_num_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_port",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_port_read,
- .store = o2nm_node_ipv4_port_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "ipv4_address",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_ipv4_address_read,
- .store = o2nm_node_ipv4_address_write,
-};
-
-static struct o2nm_node_attribute o2nm_node_attr_local = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "local",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_node_local_read,
- .store = o2nm_node_local_write,
-};
+/*
+ * Per-node attributes. Each CONFIGFS_ATTR() line has to provide the
+ * o2nm_node_attr_<name> object whose address is stored in the table below.
+ * NOTE(review): presumably it also binds o2nm_node_<name>_show/_store;
+ * confirm against the macro definition in configfs.h.
+ */
+CONFIGFS_ATTR(o2nm_node_, num);
+CONFIGFS_ATTR(o2nm_node_, ipv4_port);
+CONFIGFS_ATTR(o2nm_node_, ipv4_address);
+CONFIGFS_ATTR(o2nm_node_, local);
+/*
+ * NULL-terminated attribute table for node items. The entries are no longer
+ * placed with O2NM_NODE_ATTR_* designated indices.
+ */
static struct configfs_attribute *o2nm_node_attrs[] = {
- [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
- [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
- [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
- [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
+ &o2nm_node_attr_num,
+ &o2nm_node_attr_ipv4_port,
+ &o2nm_node_attr_ipv4_address,
+ &o2nm_node_attr_local,
NULL,
};
-static int o2nm_attr_index(struct configfs_attribute *attr)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
- if (attr == o2nm_node_attrs[i])
- return i;
- }
- BUG();
- return 0;
-}
-
-static ssize_t o2nm_node_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_node_attr->show)
- ret = o2nm_node_attr->show(node, page);
- return ret;
-}
-
-static ssize_t o2nm_node_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_node *node = to_o2nm_node(item);
- struct o2nm_node_attribute *o2nm_node_attr =
- container_of(attr, struct o2nm_node_attribute, attr);
- ssize_t ret;
- int attr_index = o2nm_attr_index(attr);
-
- if (o2nm_node_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- if (test_bit(attr_index, &node->nd_set_attributes))
- return -EBUSY;
-
- ret = o2nm_node_attr->store(node, page, count);
- if (ret < count)
- goto out;
-
- set_bit(attr_index, &node->nd_set_attributes);
-out:
- return ret;
-}
-
+/* Item operations for node items: only a release callback is set. */
static struct configfs_item_operations o2nm_node_item_ops = {
.release = o2nm_node_release,
- .show_attribute = o2nm_node_show,
- .store_attribute = o2nm_node_store,
};
static struct config_item_type o2nm_node_type = {
@@ -475,12 +400,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
}
#endif
-struct o2nm_cluster_attribute {
- struct configfs_attribute attr;
- ssize_t (*show)(struct o2nm_cluster *, char *);
- ssize_t (*store)(struct o2nm_cluster *, const char *, size_t);
-};
-
static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
unsigned int *val)
{
@@ -501,15 +420,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count,
return count;
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_read(
- struct o2nm_cluster *cluster, char *page)
+/*
+ * Show handler for the cluster "idle_timeout_ms" attribute: prints the
+ * cluster's cl_idle_timeout_ms as "%u\n" into @page and returns sprintf()'s
+ * result.
+ */
+static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item,
+						 char *page)
{
- return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
}
-static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item,
+ const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -536,15 +456,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+/*
+ * Show handler for the cluster "keepalive_delay_ms" attribute: prints the
+ * cluster's cl_keepalive_delay_ms as "%u\n" into @page and returns
+ * sprintf()'s result.
+ */
+static ssize_t o2nm_cluster_keepalive_delay_ms_show(
+		struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
}
-static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_keepalive_delay_ms_store(
+ struct config_item *item, const char *page, size_t count)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret;
unsigned int val;
@@ -571,22 +493,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write(
return ret;
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read(
- struct o2nm_cluster *cluster, char *page)
+/*
+ * Show handler for the cluster "reconnect_delay_ms" attribute: prints the
+ * cluster's cl_reconnect_delay_ms as "%u\n" into @page and returns
+ * sprintf()'s result.
+ */
+static ssize_t o2nm_cluster_reconnect_delay_ms_show(
+		struct config_item *item, char *page)
{
- return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+/*
+ * Store handler for the cluster "reconnect_delay_ms" attribute: hands @page
+ * and @count to o2nm_cluster_attr_write() with the cluster's
+ * cl_reconnect_delay_ms as the destination, and returns that helper's
+ * result.
+ */
+static ssize_t o2nm_cluster_reconnect_delay_ms_store(
+		struct config_item *item, const char *page, size_t count)
{
-return o2nm_cluster_attr_write(page, count,
- &cluster->cl_reconnect_delay_ms);
+	struct o2nm_cluster *cluster = to_o2nm_cluster(item);
+
+	return o2nm_cluster_attr_write(page, count,
+				       &cluster->cl_reconnect_delay_ms);
}
-static ssize_t o2nm_cluster_attr_fence_method_read(
- struct o2nm_cluster *cluster, char *page)
+static ssize_t o2nm_cluster_fence_method_show(
+ struct config_item *item, char *page)
{
+ struct o2nm_cluster *cluster = to_o2nm_cluster(item);
ssize_t ret = 0;
if (cluster)
@@ -595,8 +519,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read(
return ret;
}
-static ssize_t o2nm_cluster_attr_fence_method_write(
- struct o2nm_cluster *cluster, const char *page, size_t count)
+static ssize_t o2nm_cluster_fence_method_store(
+ struct config_item *item, const char *page, size_t count)
{
unsigned int i;
@@ -608,10 +532,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write(
continue;
if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
continue;
- if (cluster->cl_fence_method != i) {
+ if (to_o2nm_cluster(item)->cl_fence_method != i) {
printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
o2nm_fence_method_desc[i]);
- cluster->cl_fence_method = i;
+ to_o2nm_cluster(item)->cl_fence_method = i;
}
return count;
}
@@ -620,79 +544,18 @@ bail:
return -EINVAL;
}
-static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "idle_timeout_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_idle_timeout_ms_read,
- .store = o2nm_cluster_attr_idle_timeout_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "keepalive_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_keepalive_delay_ms_read,
- .store = o2nm_cluster_attr_keepalive_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "reconnect_delay_ms",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_reconnect_delay_ms_read,
- .store = o2nm_cluster_attr_reconnect_delay_ms_write,
-};
-
-static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
- .attr = { .ca_owner = THIS_MODULE,
- .ca_name = "fence_method",
- .ca_mode = S_IRUGO | S_IWUSR },
- .show = o2nm_cluster_attr_fence_method_read,
- .store = o2nm_cluster_attr_fence_method_write,
-};
+/*
+ * Per-cluster attributes. Each CONFIGFS_ATTR() line has to provide the
+ * o2nm_cluster_attr_<name> object whose address is stored in the table
+ * below.
+ * NOTE(review): presumably it also binds o2nm_cluster_<name>_show/_store;
+ * confirm against the macro definition in configfs.h.
+ */
+CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms);
+CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms);
+CONFIGFS_ATTR(o2nm_cluster_, fence_method);
+/* NULL-terminated attribute table for cluster items. */
static struct configfs_attribute *o2nm_cluster_attrs[] = {
- &o2nm_cluster_attr_idle_timeout_ms.attr,
- &o2nm_cluster_attr_keepalive_delay_ms.attr,
- &o2nm_cluster_attr_reconnect_delay_ms.attr,
- &o2nm_cluster_attr_fence_method.attr,
+ &o2nm_cluster_attr_idle_timeout_ms,
+ &o2nm_cluster_attr_keepalive_delay_ms,
+ &o2nm_cluster_attr_reconnect_delay_ms,
+ &o2nm_cluster_attr_fence_method,
NULL,
};
-static ssize_t o2nm_cluster_show(struct config_item *item,
- struct configfs_attribute *attr,
- char *page)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret = 0;
-
- if (o2nm_cluster_attr->show)
- ret = o2nm_cluster_attr->show(cluster, page);
- return ret;
-}
-
-static ssize_t o2nm_cluster_store(struct config_item *item,
- struct configfs_attribute *attr,
- const char *page, size_t count)
-{
- struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- struct o2nm_cluster_attribute *o2nm_cluster_attr =
- container_of(attr, struct o2nm_cluster_attribute, attr);
- ssize_t ret;
-
- if (o2nm_cluster_attr->store == NULL) {
- ret = -EINVAL;
- goto out;
- }
-
- ret = o2nm_cluster_attr->store(cluster, page, count);
- if (ret < count)
- goto out;
-out:
- return ret;
-}
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
const char *name)
@@ -767,14 +630,11 @@ static void o2nm_cluster_release(struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- kfree(cluster->cl_group.default_groups);
kfree(cluster);
}
+/* Item operations for cluster items: only a release callback is set. */
static struct configfs_item_operations o2nm_cluster_item_ops = {
.release = o2nm_cluster_release,
- .show_attribute = o2nm_cluster_show,
- .store_attribute = o2nm_cluster_store,
};
static struct config_item_type o2nm_cluster_type = {
@@ -805,7 +665,6 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
struct o2nm_cluster *cluster = NULL;
struct o2nm_node_group *ns = NULL;
struct config_group *o2hb_group = NULL, *ret = NULL;
- void *defs = NULL;
/* this runs under the parent dir's i_mutex; there can be only
* one caller in here at a time */
@@ -814,20 +673,18 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
- defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
o2hb_group = o2hb_alloc_hb_set();
- if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
+ if (cluster == NULL || ns == NULL || o2hb_group == NULL)
goto out;
config_group_init_type_name(&cluster->cl_group, name,
&o2nm_cluster_type);
+ configfs_add_default_group(&ns->ns_group, &cluster->cl_group);
+
config_group_init_type_name(&ns->ns_group, "node",
&o2nm_node_group_type);
+ configfs_add_default_group(o2hb_group, &cluster->cl_group);
- cluster->cl_group.default_groups = defs;
- cluster->cl_group.default_groups[0] = &ns->ns_group;
- cluster->cl_group.default_groups[1] = o2hb_group;
- cluster->cl_group.default_groups[2] = NULL;
rwlock_init(&cluster->cl_nodes_lock);
cluster->cl_node_ip_tree = RB_ROOT;
cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
@@ -843,7 +700,6 @@ out:
kfree(cluster);
kfree(ns);
o2hb_free_hb_set(o2hb_group);
- kfree(defs);
ret = ERR_PTR(-ENOMEM);
}
@@ -853,18 +709,11 @@ out:
+/*
+ * Tear down the cluster that @item belongs to: assert that it is the one
+ * recorded in o2nm_single_cluster, clear that pointer, remove the cluster
+ * group's default groups via configfs_remove_default_groups() and drop a
+ * reference on @item. @group is not used.
+ * NOTE(review): presumably installed as a configfs drop_item callback;
+ * confirm where it is registered.
+ */
static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
{
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
- int i;
- struct config_item *killme;
BUG_ON(o2nm_single_cluster != cluster);
o2nm_single_cluster = NULL;
- for (i = 0; cluster->cl_group.default_groups[i]; i++) {
- killme = &cluster->cl_group.default_groups[i]->cg_item;
- cluster->cl_group.default_groups[i] = NULL;
- config_item_put(killme);
- }
-
+ configfs_remove_default_groups(&cluster->cl_group);
config_item_put(item);
}
@@ -896,7 +745,7 @@ int o2nm_depend_item(struct config_item *item)
+/*
+ * Thin wrapper that forwards @item to configfs_undepend_item(); the
+ * subsystem argument is no longer passed.
+ */
void o2nm_undepend_item(struct config_item *item)
{
- configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+ configfs_undepend_item(item);
}
int o2nm_depend_this_node(void)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ffecf89c8c1c..e1adf285fc31 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
mlog_errno(ret);
goto out;
}
- mutex_lock(&dx_alloc_inode->i_mutex);
+ inode_lock(dx_alloc_inode);
ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
if (ret) {
@@ -4410,7 +4410,7 @@ out_unlock:
ocfs2_inode_unlock(dx_alloc_inode, 1);
out_mutex:
- mutex_unlock(&dx_alloc_inode->i_mutex);
+ inode_unlock(dx_alloc_inode);
brelse(dx_alloc_bh);
out:
iput(dx_alloc_inode);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8c83ff..004f2cbe8f71 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
#define DLM_LOCK_RES_DROPPING_REF 0x00000040
#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
+#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
/* max milliseconds to wait to sync up a network failure with a node death */
#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -376,17 +377,6 @@ struct dlm_lock
lksb_kernel_allocated:1;
};
-
-#define DLM_LKSB_UNUSED1 0x01
-#define DLM_LKSB_PUT_LVB 0x02
-#define DLM_LKSB_GET_LVB 0x04
-#define DLM_LKSB_UNUSED2 0x08
-#define DLM_LKSB_UNUSED3 0x10
-#define DLM_LKSB_UNUSED4 0x20
-#define DLM_LKSB_UNUSED5 0x40
-#define DLM_LKSB_UNUSED6 0x80
-
-
enum dlm_lockres_list {
DLM_GRANTED_LIST = 0,
DLM_CONVERTING_LIST = 1,
@@ -462,6 +452,7 @@ enum {
DLM_QUERY_REGION = 519,
DLM_QUERY_NODEINFO = 520,
DLM_BEGIN_EXIT_DOMAIN_MSG = 521,
+ DLM_DEREF_LOCKRES_DONE = 522,
};
struct dlm_reco_node_data
@@ -556,7 +547,7 @@ struct dlm_master_requery
* };
*
* from ../cluster/tcp.h
- * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
+ * O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
* (roughly 4080 bytes)
* and sizeof(dlm_migratable_lockres) = 112 bytes
* and sizeof(dlm_migratable_lock) = 16 bytes
@@ -597,7 +588,7 @@ struct dlm_migratable_lockres
/* from above, 128 bytes
* for some undetermined future use */
-#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
+#define DLM_MIG_LOCKRES_RESERVED (O2NET_MAX_PAYLOAD_BYTES - \
DLM_MIG_LOCKRES_MAX_LEN)
struct dlm_create_lock
@@ -793,6 +784,20 @@ struct dlm_deref_lockres
u8 name[O2NM_MAX_NAME_LEN];
};
+/*
+ * Status values returned by dlm_deref_lockres_handler():
+ * DLM_DEREF_RESPONSE_DONE when the request was finished in the handler,
+ * DLM_DEREF_RESPONSE_INPROG when the handler queued the work and returned.
+ */
+enum {
+ DLM_DEREF_RESPONSE_DONE = 0,
+ DLM_DEREF_RESPONSE_INPROG = 1,
+};
+
+/*
+ * Payload of a DLM_DEREF_LOCKRES_DONE message: identifies the sending node
+ * and the lock resource by name.
+ */
+struct dlm_deref_lockres_done {
+ u32 pad1; /* NOTE(review): presumably padding only -- confirm */
+ u16 pad2; /* NOTE(review): presumably padding only -- confirm */
+ u8 node_idx; /* filled with the sender's dlm->node_num */
+ u8 namelen; /* number of bytes used in name[] */
+
+ u8 name[O2NM_MAX_NAME_LEN];
+};
+
static inline enum dlm_status
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
{
@@ -800,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
assert_spin_locked(&res->spinlock);
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
status = DLM_RECOVERING;
else if (res->state & DLM_LOCK_RES_MIGRATING)
status = DLM_MIGRATING;
@@ -979,6 +985,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
void dlm_assert_master_post_handler(int status, void *data, void *ret_data);
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+ void **ret_data);
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -1020,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
{
__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING|
DLM_LOCK_RES_MIGRATING));
}
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63ff1783..cdeafb4e7ed6 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -212,6 +212,12 @@ grant:
if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+ /*
+ * Move the lock to the tail because it may be the only lock which has
+ * an invalid lvb.
+ */
+ list_move_tail(&lock->list, &res->granted);
+
status = DLM_NORMAL;
*call_ast = 1;
goto unlock_exit;
@@ -262,6 +268,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
+ u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -287,6 +294,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
status = DLM_DENIED;
goto bail;
}
+
+ if (lock->ml.type == type && lock->ml.convert_type == LKM_IVMODE) {
+ mlog(0, "last convert request returned DLM_RECOVERING, but "
+ "owner has already queued and sent ast to me. res %.*s, "
+ "(cookie=%u:%llu, type=%d, conv=%d)\n",
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
+ lock->ml.type, lock->ml.convert_type);
+ status = DLM_NORMAL;
+ goto bail;
+ }
+
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
@@ -316,11 +336,19 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
- /* if it failed, move it back to granted queue */
+ /* if it failed, move it back to granted queue.
+ * if master returns DLM_NORMAL and then down before sending ast,
+ * it may have already been moved to granted queue, reset to
+ * DLM_RECOVERING and retry convert */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
+ } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
+ (old_owner != res->owner)) {
+ mlog(0, "res %.*s is in recovering or has been recovered.\n",
+ res->lockname.len, res->lockname.name);
+ status = DLM_RECOVERING;
}
bail:
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ee7fe747cea..12e064b8be9a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -132,10 +132,13 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* - Message DLM_QUERY_NODEINFO added to allow online node removes
* New in version 1.2:
* - Message DLM_BEGIN_EXIT_DOMAIN_MSG added to mark start of exit domain
+ * New in version 1.3:
+ * - Message DLM_DEREF_LOCKRES_DONE added to inform non-master that the
+ * refmap is cleared
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 2,
+ .pv_minor = 3,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -1396,7 +1399,7 @@ static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
unsigned int map_size)
{
int status, tmpstat;
- unsigned int node;
+ int node;
if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
sizeof(unsigned long))) {
@@ -1853,7 +1856,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
+ if (status)
+ goto bail;
+ status = o2net_register_handler(DLM_DEREF_LOCKRES_DONE, dlm->key,
+ sizeof(struct dlm_deref_lockres_done),
+ dlm_deref_lockres_done_handler,
+ dlm, NULL, &dlm->dlm_domain_handlers);
bail:
if (status)
dlm_unregister_domain_handlers(dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ce38b4ccc9ab..9aed6e202201 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2278,7 +2278,7 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
dlm_print_one_lock_resource(res);
BUG();
}
- return ret;
+ return ret ? ret : r;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2345,7 +2345,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
- ret = 0;
+ ret = DLM_DEREF_RESPONSE_DONE;
goto done;
}
@@ -2365,7 +2365,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
- return 0;
+ return DLM_DEREF_RESPONSE_INPROG;
done:
if (res)
@@ -2375,6 +2375,122 @@ done:
return ret;
}
+/*
+ * dlm_deref_lockres_done_handler - handle a DLM_DEREF_LOCKRES_DONE message
+ * @msg: received message; msg->buf is read as a struct dlm_deref_lockres_done
+ * @len: message length (not used here)
+ * @data: the struct dlm_ctxt the handler was registered with
+ * @ret_data: not used here
+ *
+ * Looks up the lock resource named in the message, takes it off the purge
+ * list if it is queued there, unhashes it, removes it from the tracking
+ * list, clears DLM_LOCK_RES_DROPPING_REF and wakes waiters on res->wq.
+ *
+ * Returns 0 when the message was handled (and when dlm_grab() fails), or
+ * -EINVAL when the name length or node index is out of range or no lock
+ * resource with that name is found. The sender logs any negative status as
+ * an error, so the success path must not return the initial -EINVAL.
+ */
+int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
+				   void **ret_data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_deref_lockres_done *deref
+			= (struct dlm_deref_lockres_done *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	char *name;
+	unsigned int namelen;
+	int ret = -EINVAL;
+	u8 node;
+	unsigned int hash;
+
+	if (!dlm_grab(dlm))
+		return 0;
+
+	name = deref->name;
+	namelen = deref->namelen;
+	node = deref->node_idx;
+
+	if (namelen > DLM_LOCKID_NAME_MAX) {
+		mlog(ML_ERROR, "Invalid name length!\n");
+		goto done;
+	}
+	if (node >= O2NM_MAX_NODES) {
+		mlog(ML_ERROR, "Invalid node number: %u\n", node);
+		goto done;
+	}
+
+	hash = dlm_lockid_hash(name, namelen);
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
+	if (!res) {
+		spin_unlock(&dlm->spinlock);
+		mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
+		     dlm->name, namelen, name);
+		goto done;
+	}
+
+	spin_lock(&res->spinlock);
+	BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
+	if (!list_empty(&res->purge)) {
+		mlog(0, "%s: Removing res %.*s from purgelist\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		list_del_init(&res->purge);
+		dlm_lockres_put(res);
+		dlm->purge_count--;
+	}
+
+	if (!__dlm_lockres_unused(res)) {
+		mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		__dlm_print_one_lock_resource(res);
+		BUG();
+	}
+
+	__dlm_unhash_lockres(dlm, res);
+
+	spin_lock(&dlm->track_lock);
+	if (!list_empty(&res->tracking))
+		list_del_init(&res->tracking);
+	else {
+		mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		__dlm_print_one_lock_resource(res);
+	}
+	spin_unlock(&dlm->track_lock);
+
+	/* lockres is not in the hash now. drop the flag and wake up
+	 * any processes waiting in dlm_get_lock_resource.
+	 */
+	res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+	spin_unlock(&res->spinlock);
+	wake_up(&res->wq);
+
+	dlm_lockres_put(res);
+
+	spin_unlock(&dlm->spinlock);
+
+	/* handled: report success instead of the initial -EINVAL */
+	ret = 0;
+done:
+	dlm_put(dlm);
+	return ret;
+}
+
+/*
+ * Send a DLM_DEREF_LOCKRES_DONE message for @res to @node. The message
+ * carries this node's number (dlm->node_num) and the lock resource name.
+ * A send failure or a negative status from the remote handler is logged
+ * and otherwise ignored; nothing is returned to the caller.
+ */
+static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
+		struct dlm_lock_resource *res, u8 node)
+{
+	const char *lockname = res->lockname.name;
+	unsigned int namelen = res->lockname.len;
+	struct dlm_deref_lockres_done done_msg;
+	int send_err, remote_status;
+
+	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+
+	memset(&done_msg, 0, sizeof(done_msg));
+	done_msg.node_idx = dlm->node_num;
+	done_msg.namelen = namelen;
+	memcpy(done_msg.name, lockname, namelen);
+
+	send_err = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
+				      &done_msg, sizeof(done_msg), node,
+				      &remote_status);
+	if (send_err < 0) {
+		mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
+		     " to node %u\n", dlm->name, namelen,
+		     lockname, send_err, node);
+		return;
+	}
+
+	if (remote_status < 0) {
+		/* ignore the error */
+		mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+		     dlm->name, namelen, lockname, node, remote_status);
+		dlm_print_one_lock_resource(res);
+	}
+}
+
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm;
@@ -2388,13 +2504,15 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
spin_lock(&res->spinlock);
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+ __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
if (test_bit(node, res->refmap)) {
- __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
+ dlm_drop_lockres_ref_done(dlm, res, node);
+
if (cleared) {
mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
dlm->name, res->lockname.len, res->lockname.name, node);
@@ -2432,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
return 0;
/* delay migration when the lockres is in RECOCERING state */
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
if (res->owner != dlm->node_num)
@@ -2519,6 +2638,11 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
+ /* get an extra reference on the mle.
+ * otherwise the assert_master from the new
+ * master will destroy this.
+ */
+ dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -2544,7 +2668,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
}
fail:
- if (oldmle) {
+ if (ret != -EEXIST && oldmle) {
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, oldmle);
dlm_put_mle(oldmle);
@@ -2554,6 +2678,7 @@ fail:
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
@@ -2571,17 +2696,6 @@ fail:
* ensure that all assert_master work is flushed. */
flush_workqueue(dlm->dlm_worker);
- /* get an extra reference on the mle.
- * otherwise the assert_master from the new
- * master will destroy this.
- * also, make sure that all callers of dlm_get_mle
- * take both dlm->spinlock and dlm->master_lock */
- spin_lock(&dlm->spinlock);
- spin_lock(&dlm->master_lock);
- dlm_get_mle_inuse(mle);
- spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
-
/* notify new node and send all lock state */
/* call send_one_lockres with migration flag.
* this serves as notice to the target node that a
@@ -2843,6 +2957,8 @@ again:
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
if (!ret)
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+ else
+ res->migration_pending = 0;
spin_unlock(&res->spinlock);
/*
@@ -3048,7 +3164,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
int ret = 0;
if (!dlm_grab(dlm))
- return -EINVAL;
+ return 0;
name = migrate->name;
namelen = migrate->namelen;
@@ -3139,7 +3255,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
mlog(0, "tried to migrate %.*s, but some "
"process beat me to it\n",
namelen, name);
- ret = -EEXIST;
+ spin_unlock(&tmp->spinlock);
+ return -EEXIST;
} else {
/* bad. 2 NODES are trying to migrate! */
mlog(ML_ERROR, "migration error mle: "
@@ -3310,6 +3427,15 @@ top:
mle->new_master != dead_node)
continue;
+ if (mle->new_master == dead_node && mle->inuse) {
+ mlog(ML_NOTICE, "%s: target %u died during "
+ "migration from %u, the MLE is "
+ "still keep used, ignore it!\n",
+ dlm->name, dead_node,
+ mle->master);
+ continue;
+ }
+
/* If we have reached this point, this mle needs to be
* removed from the list and freed. */
dlm_clean_migration_mle(dlm, mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9e4f862d20fe..f6b313898763 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
char *buf = NULL;
struct dlm_work_item *item = NULL;
struct dlm_lock_resource *res = NULL;
+ unsigned int hash;
if (!dlm_grab(dlm))
return -EINVAL;
@@ -1400,11 +1401,26 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
/* lookup the lock to see if we have a secondary queue for this
* already... just add the locks in and this will have its owner
* and RECOVERY flag changed when it completes. */
- res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+ hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+ spin_lock(&dlm->spinlock);
+ res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
+ hash);
if (res) {
/* this will get a ref on res */
/* mark it as recovering/migrating and hash it */
spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(0, "%s: node is attempting to migrate "
+ "lockres %.*s, but marked as dropping "
+ " ref!\n", dlm->name,
+ mres->lockname_len, mres->lockname);
+ ret = -EINVAL;
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ dlm_lockres_put(res);
+ goto leave;
+ }
+
if (mres->flags & DLM_MRES_RECOVERY) {
res->state |= DLM_LOCK_RES_RECOVERING;
} else {
@@ -1421,13 +1437,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
mres->lockname_len, mres->lockname);
ret = -EFAULT;
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
dlm_lockres_put(res);
goto leave;
}
res->state |= DLM_LOCK_RES_MIGRATING;
}
spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
} else {
+ spin_unlock(&dlm->spinlock);
/* need to allocate, just like if it was
* mastered here normally */
res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
@@ -2064,7 +2083,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
mlog(0, "node died with convert pending "
"on %.*s. move back to granted list.\n",
res->lockname.len, res->lockname.name);
@@ -2156,6 +2174,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, bucket, hash_node) {
+ if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ }
+
if (!(res->state & DLM_LOCK_RES_RECOVERING))
continue;
@@ -2293,6 +2318,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, freed, dead_node);
__dlm_print_one_lock_resource(res);
}
+ res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
} else if (test_bit(dead_node, res->refmap)) {
mlog(0, "%s:%.*s: dead node %u had a ref, but had "
@@ -2360,6 +2386,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ dead_node);
spin_unlock(&res->spinlock);
continue;
}
@@ -2368,14 +2396,16 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dlm_revalidate_lvb(dlm, res, dead_node);
if (res->owner == dead_node) {
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
- mlog(ML_NOTICE, "%s: res %.*s, Skip "
- "recovery as it is being freed\n",
- dlm->name, res->lockname.len,
- res->lockname.name);
- } else
- dlm_move_lockres_to_recovery_list(dlm,
- res);
-
+ mlog(0, "%s:%.*s: owned by "
+ "dead node %u, this node was "
+ "dropping its ref when it died. "
+ "continue, dropping the flag.\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, dead_node);
+ }
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ dlm_move_lockres_to_recovery_list(dlm,
+ res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
@@ -2450,11 +2480,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
* perhaps later we can genericize this for other waiters. */
wake_up(&dlm->migration_wq);
- if (test_bit(idx, dlm->recovery_map))
- mlog(0, "domain %s, node %u already added "
- "to recovery map!\n", dlm->name, idx);
- else
- set_bit(idx, dlm->recovery_map);
+ set_bit(idx, dlm->recovery_map);
}
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c241ecd7..68d239ba0c63 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
- if (res->state & DLM_LOCK_RES_RECOVERING)
+ if (res->state & (DLM_LOCK_RES_RECOVERING|
+ DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
/* Another node has this resource with this node as the master */
@@ -202,6 +203,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
+ if (!master && ret != 0) {
+ mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
@@ -700,7 +708,8 @@ static int dlm_thread(void *data)
* dirty for a short while. */
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
- DLM_LOCK_RES_RECOVERING)) {
+ DLM_LOCK_RES_RECOVERING |
+ DLM_LOCK_RES_RECOVERY_WAITING)) {
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 2e3c9dbab68c..1082b2c3014b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
}
if (!dlm_grab(dlm))
- return DLM_REJECTED;
+ return DLM_FORWARD;
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b5cf27dcb18a..47b3b2d4e775 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -571,8 +571,8 @@ static int dlmfs_fill_super(struct super_block * sb,
int silent)
{
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
@@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void)
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 20276e340339..474e57f834e6 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
unsigned int gen;
int noqueue_attempted = 0;
int dlm_locked = 0;
+ int kick_dc = 0;
if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) {
mlog_errno(-EINVAL);
@@ -1524,7 +1525,12 @@ update_holders:
unlock:
lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
+ /* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+ kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
+
spin_unlock_irqrestore(&lockres->l_lock, flags);
+ if (kick_dc)
+ ocfs2_wake_downconvert_thread(osb);
out:
/*
* This is helping work around a lock inversion between the page lock
@@ -2432,12 +2438,6 @@ bail:
* done this we have to return AOP_TRUNCATED_PAGE so the aop method
* that called us can bubble that back up into the VFS who will then
* immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop. This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
*/
int ocfs2_inode_lock_with_page(struct inode *inode,
struct buffer_head **ret_bh,
@@ -2449,8 +2449,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
if (ret == -EAGAIN) {
unlock_page(page);
- if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
- ocfs2_inode_unlock(inode, ex);
ret = AOP_TRUNCATED_PAGE;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f92e..5308841756be 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -770,14 +770,14 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
+ unsigned long index = abs_from >> PAGE_SHIFT;
handle_t *handle;
int ret = 0;
unsigned zero_from, zero_to, block_start, block_end;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
BUG_ON(abs_from >= abs_to);
- BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+ BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
BUG_ON(abs_from & (inode->i_blkbits - 1));
handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
@@ -794,10 +794,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
}
/* Get the offsets within the page that we want to zero */
- zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
- zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+ zero_from = abs_from & (PAGE_SIZE - 1);
+ zero_to = abs_to & (PAGE_SIZE - 1);
if (!zero_to)
- zero_to = PAGE_CACHE_SIZE;
+ zero_to = PAGE_SIZE;
trace_ocfs2_write_zero_page(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -851,7 +851,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
out_unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out_commit_trans:
if (handle)
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -959,7 +959,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
BUG_ON(range_start >= range_end);
while (zero_pos < range_end) {
- next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+ next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
if (next_pos > range_end)
next_pos = range_end;
rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
@@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
}
generic_fillattr(inode, stat);
+ /*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others don't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+ stat->blocks += (stat->size + 511)>>9;
/* We set the blksize from the cluster size for performance */
stat->blksize = osb->s_clustersize;
@@ -1373,44 +1381,6 @@ out:
return ret;
}
-/*
- * Will look for holes and unwritten extents in the range starting at
- * pos for count bytes (inclusive).
- */
-static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
- size_t count)
-{
- int ret = 0;
- unsigned int extent_flags;
- u32 cpos, clusters, extent_len, phys_cpos;
- struct super_block *sb = inode->i_sb;
-
- cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
- clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
-
- while (clusters) {
- ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
- &extent_flags);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
- ret = 1;
- break;
- }
-
- if (extent_len > clusters)
- extent_len = clusters;
-
- clusters -= extent_len;
- cpos += extent_len;
- }
-out:
- return ret;
-}
-
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
@@ -1864,7 +1834,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* This prevents concurrent writes on other nodes
@@ -1983,7 +1953,7 @@ out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return ret;
}
@@ -2121,18 +2091,12 @@ out:
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos,
- size_t count,
- int appending,
- int *direct_io,
- int *has_refcount)
+ size_t count)
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
loff_t end;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int full_coherency = !(osb->s_mount_opt &
- OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
@@ -2181,10 +2145,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
pos,
count,
&meta_level);
- if (has_refcount)
- *has_refcount = 1;
- if (direct_io)
- *direct_io = 0;
}
if (ret < 0) {
@@ -2192,67 +2152,12 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
goto out_unlock;
}
- /*
- * Skip the O_DIRECT checks if we don't need
- * them.
- */
- if (!direct_io || !(*direct_io))
- break;
-
- /*
- * There's no sane way to do direct writes to an inode
- * with inline data.
- */
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Allowing concurrent direct writes means
- * i_size changes wouldn't be synchronized, so
- * one node could wind up truncating another
- * nodes writes.
- */
- if (end > i_size_read(inode) && !full_coherency) {
- *direct_io = 0;
- break;
- }
-
- /*
- * Fallback to old way if the feature bit is not set.
- */
- if (end > i_size_read(inode) &&
- !ocfs2_supports_append_dio(osb)) {
- *direct_io = 0;
- break;
- }
-
- /*
- * We don't fill holes during direct io, so
- * check for them here. If any are found, the
- * caller will have to retake some cluster
- * locks and initiate the io as buffered.
- */
- ret = ocfs2_check_range_for_holes(inode, pos, count);
- if (ret == 1) {
- /*
- * Fallback to old way if the feature bit is not set.
- * Otherwise try dio first and then complete the rest
- * request through buffer io.
- */
- if (!ocfs2_supports_append_dio(osb))
- *direct_io = 0;
- ret = 0;
- } else if (ret < 0)
- mlog_errno(ret);
break;
}
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, appending, count,
- direct_io, has_refcount);
+ pos, count);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
@@ -2264,18 +2169,16 @@ out:
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, appending, rw_level;
- int can_do_direct, has_refcount = 0;
+ int direct_io, rw_level;
ssize_t written = 0;
ssize_t ret;
- size_t count = iov_iter_count(from), orig_count;
+ size_t count = iov_iter_count(from);
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
- int unaligned_dio = 0;
- int dropped_dio = 0;
+ void *saved_ki_complete = NULL;
int append_write = ((iocb->ki_pos + count) >=
i_size_read(inode) ? 1 : 0);
@@ -2288,12 +2191,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
if (count == 0)
return 0;
- appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
-relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
@@ -2326,7 +2227,6 @@ relock:
ocfs2_inode_unlock(inode, 1);
}
- orig_count = iov_iter_count(from);
ret = generic_write_checks(iocb, from);
if (ret <= 0) {
if (ret)
@@ -2335,41 +2235,18 @@ relock:
}
count = ret;
- can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
- &can_do_direct, &has_refcount);
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- if (direct_io && !is_sync_kiocb(iocb))
- unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
-
- /*
- * We can't complete the direct I/O as requested, fall back to
- * buffered I/O.
- */
- if (direct_io && !can_do_direct) {
- ocfs2_rw_unlock(inode, rw_level);
-
- rw_level = -1;
-
- direct_io = 0;
- iocb->ki_flags &= ~IOCB_DIRECT;
- iov_iter_reexpand(from, orig_count);
- dropped_dio = 1;
- goto relock;
- }
-
- if (unaligned_dio) {
+ if (direct_io && !is_sync_kiocb(iocb) &&
+ ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
/*
- * Wait on previous unaligned aio to complete before
- * proceeding.
+ * Make it a sync io if it's an unaligned aio.
*/
- mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
- /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
- ocfs2_iocb_set_unaligned_aio(iocb);
+ saved_ki_complete = xchg(&iocb->ki_complete, NULL);
}
/* communicate with ocfs2_dio_end_io */
@@ -2390,14 +2267,13 @@ relock:
*/
if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
rw_level = -1;
- unaligned_dio = 0;
}
if (unlikely(written <= 0))
- goto no_sync;
+ goto out;
if (((file->f_flags & O_DSYNC) && !direct_io) ||
- IS_SYNC(inode) || dropped_dio) {
+ IS_SYNC(inode)) {
ret = filemap_fdatawrite_range(file->f_mapping,
iocb->ki_pos - written,
iocb->ki_pos - 1);
@@ -2416,18 +2292,15 @@ relock:
iocb->ki_pos - 1);
}
-no_sync:
- if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
- ocfs2_iocb_clear_unaligned_aio(iocb);
- mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
- }
-
out:
+ if (saved_ki_complete)
+ xchg(&iocb->ki_complete, saved_ki_complete);
+
if (rw_level != -1)
ocfs2_rw_unlock(inode, rw_level);
out_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (written)
ret = written;
@@ -2539,7 +2412,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file->f_mapping->host;
int ret = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_SET:
@@ -2577,7 +2450,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret)
return ret;
return offset;
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
new file mode 100644
index 000000000000..2cabbcf2f28e
--- /dev/null
+++ b/fs/ocfs2/filecheck.c
@@ -0,0 +1,606 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.c
+ *
+ * Code which implements online file check.
+ *
+ * Copyright (C) 2016 SuSE. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/sysctl.h>
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "ocfs2_fs.h"
+#include "stackglue.h"
+#include "inode.h"
+
+#include "filecheck.h"
+
+
+/* File check error strings,
+ * must correspond with error number in header file.
+ * Index 0 is the string for error number 0; ocfs2_filecheck_error() maps
+ * every other error number to index
+ * (errno - OCFS2_FILECHECK_ERR_START + 1).
+ */
+static const char * const ocfs2_filecheck_errs[] = {
+ "SUCCESS",
+ "FAILED",
+ "INPROGRESS",
+ "READONLY",
+ "INJBD",
+ "INVALIDINO",
+ "BLOCKECC",
+ "BLOCKNO",
+ "VALIDFLAG",
+ "GENERATION",
+ "UNSUPPORTED"
+};
+
+static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); /* held around every walk/update of ocfs2_filecheck_sysfs_list */
+static LIST_HEAD(ocfs2_filecheck_sysfs_list); /* registered ocfs2_filecheck_sysfs_entry objects, linked via fs_list */
+
+struct ocfs2_filecheck { /* per-mount list of file check entries plus its counters */
+ struct list_head fc_head; /* File check entry list head */
+ spinlock_t fc_lock; /* taken around fc_head walks and the counters below */
+ unsigned int fc_max; /* Maximum number of entries in list */
+ unsigned int fc_size; /* Current entry count in list */
+ unsigned int fc_done; /* Finished entry count in list */
+};
+
+struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mount */
+ struct list_head fs_list; /* link in ocfs2_filecheck_sysfs_list */
+ atomic_t fs_count; /* reference count: set to 1 at creation, raised by _sysfs_get(), dropped by _sysfs_put()/_sysfs_free() */
+ struct super_block *fs_sb; /* its s_id is the key used for lookups */
+ struct kset *fs_devicekset; /* kset named after sb->s_id, created under ocfs2_kset */
+ struct kset *fs_fcheckkset; /* "filecheck" kset created under fs_devicekset */
+ struct ocfs2_filecheck *fs_fcheck; /* entry list and counters; freed together with this object */
+};
+
+#define OCFS2_FILECHECK_MAXSIZE 100 /* largest list length ocfs2_filecheck_adjust_max() accepts */
+#define OCFS2_FILECHECK_MINSIZE 10 /* smallest accepted length; also the initial fc_max */
+
+/* File check operation type; ocfs2_filecheck_type_parse() derives it from
+ * the sysfs attribute name ("check", "fix" or "set").
+ */
+enum {
+ OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
+ OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
+ OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
+};
+
+struct ocfs2_filecheck_entry { /* one element of ocfs2_filecheck.fc_head */
+ struct list_head fe_list; /* link in ocfs2_filecheck.fc_head */
+ unsigned long fe_ino; /* inode number; printed in the INO column by the show handler */
+ unsigned int fe_type; /* OCFS2_FILECHECK_TYPE_* value; the show handler filters on it */
+ unsigned int fe_done:1; /* set to 1 by ocfs2_filecheck_done_entry(); only done entries may be erased */
+ unsigned int fe_status:31; /* error number rendered through ocfs2_filecheck_error() */
+};
+
+struct ocfs2_filecheck_args { /* parsed form of a value written to a filecheck attribute */
+ unsigned int fa_type; /* OCFS2_FILECHECK_TYPE_* value taken from the attribute name */
+ union {
+ unsigned long fa_ino; /* filled in for every type other than OCFS2_FILECHECK_TYPE_SET */
+ unsigned int fa_len; /* filled in for OCFS2_FILECHECK_TYPE_SET */
+ };
+};
+
+/*
+ * Translate a file check error number into its printable name.
+ *
+ * Zero maps to the first table slot.  Any other value must lie within
+ * [OCFS2_FILECHECK_ERR_START, OCFS2_FILECHECK_ERR_END] (anything else is
+ * a BUG) and is rebased so that OCFS2_FILECHECK_ERR_START lands on slot 1.
+ */
+static const char *
+ocfs2_filecheck_error(int errno)
+{
+	if (errno != 0) {
+		BUG_ON(errno < OCFS2_FILECHECK_ERR_START ||
+		       errno > OCFS2_FILECHECK_ERR_END);
+		return ocfs2_filecheck_errs[errno -
+					    OCFS2_FILECHECK_ERR_START + 1];
+	}
+
+	return ocfs2_filecheck_errs[0];
+}
+
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf); /* forward declaration: show handler shared by all three attributes */
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count); /* forward declaration: store handler shared by all three attributes */
+static struct kobj_attribute ocfs2_attr_filecheck_chk = /* "check" file, owner read/write */
+ __ATTR(check, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_fix = /* "fix" file, owner read/write */
+ __ATTR(fix, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+static struct kobj_attribute ocfs2_attr_filecheck_set = /* "set" file, owner read/write */
+ __ATTR(set, S_IRUSR | S_IWUSR,
+ ocfs2_filecheck_show,
+ ocfs2_filecheck_store);
+
+static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
+{ /* action callback passed to wait_on_atomic_t() by ocfs2_filecheck_sysfs_free() */
+ schedule(); /* give up the CPU; @p is not used */
+ return 0;
+}
+
+/*
+ * Release a sysfs entry and everything hanging off it.
+ *
+ * Drops one reference on @entry and, if others are still held, sleeps
+ * uninterruptibly until the count reaches zero.  Then frees every queued
+ * file check entry (each must already be done), unregisters both ksets
+ * and frees the list structure and @entry itself.
+ */
+static void
+ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+	struct ocfs2_filecheck_entry *cur, *next;
+	struct ocfs2_filecheck *fcheck;
+
+	if (!atomic_dec_and_test(&entry->fs_count))
+		wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+				 TASK_UNINTERRUPTIBLE);
+
+	fcheck = entry->fs_fcheck;
+
+	spin_lock(&fcheck->fc_lock);
+	list_for_each_entry_safe(cur, next, &fcheck->fc_head, fe_list) {
+		list_del(&cur->fe_list);
+		/* An entry that is not done yet must never be freed. */
+		BUG_ON(!cur->fe_done);
+		kfree(cur);
+	}
+	spin_unlock(&fcheck->fc_lock);
+
+	kset_unregister(entry->fs_fcheckkset);
+	kset_unregister(entry->fs_devicekset);
+	kfree(fcheck);
+	kfree(entry);
+}
+
+static void
+ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
+{ /* append @entry to the global list of sysfs entries */
+ spin_lock(&ocfs2_filecheck_sysfs_lock); /* the list is only modified under this lock */
+ list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
+ spin_unlock(&ocfs2_filecheck_sysfs_lock);
+}
+
+/*
+ * Unlink and free the sysfs entry whose super block id equals @devname.
+ *
+ * The entry is removed from the global list under the list lock and
+ * freed after the lock has been dropped.
+ *
+ * Returns 0 if an entry was found and freed, 1 if none matched.
+ */
+static int ocfs2_filecheck_sysfs_del(const char *devname)
+{
+	struct ocfs2_filecheck_sysfs_entry *cur;
+	struct ocfs2_filecheck_sysfs_entry *found = NULL;
+
+	spin_lock(&ocfs2_filecheck_sysfs_lock);
+	list_for_each_entry(cur, &ocfs2_filecheck_sysfs_list, fs_list) {
+		if (strcmp(cur->fs_sb->s_id, devname) != 0)
+			continue;
+
+		list_del(&cur->fs_list);
+		found = cur;
+		break;
+	}
+	spin_unlock(&ocfs2_filecheck_sysfs_lock);
+
+	if (!found)
+		return 1;
+
+	ocfs2_filecheck_sysfs_free(found);
+	return 0;
+}
+
+/*
+ * Drop a reference taken with ocfs2_filecheck_sysfs_get().  When the
+ * count reaches zero, wake whoever is waiting on it.
+ */
+static void
+ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
+{
+	if (!atomic_dec_and_test(&entry->fs_count))
+		return;
+
+	wake_up_atomic_t(&entry->fs_count);
+}
+
+/*
+ * Look up the sysfs entry whose super block id equals @devname and take
+ * a reference on it.  The reference is taken while the list lock is
+ * still held.
+ *
+ * Returns the entry, or NULL if none matched.  A successful lookup must
+ * be paired with ocfs2_filecheck_sysfs_put().
+ */
+static struct ocfs2_filecheck_sysfs_entry *
+ocfs2_filecheck_sysfs_get(const char *devname)
+{
+	struct ocfs2_filecheck_sysfs_entry *cur;
+	struct ocfs2_filecheck_sysfs_entry *found = NULL;
+
+	spin_lock(&ocfs2_filecheck_sysfs_lock);
+	list_for_each_entry(cur, &ocfs2_filecheck_sysfs_list, fs_list) {
+		if (strcmp(cur->fs_sb->s_id, devname) != 0)
+			continue;
+
+		atomic_inc(&cur->fs_count);
+		found = cur;
+		break;
+	}
+	spin_unlock(&ocfs2_filecheck_sysfs_lock);
+
+	return found;
+}
+
+/*
+ * Create the sysfs hierarchy for online file check on @sb.
+ *
+ * Creates a kset named after sb->s_id under ocfs2_kset and a "filecheck"
+ * kset below it holding the "check", "fix" and "set" attributes.  It then
+ * registers a tracking entry with an empty file check list whose maximum
+ * length starts at OCFS2_FILECHECK_MINSIZE.
+ *
+ * Returns 0 on success; -ENOMEM if ocfs2_kset is missing or an
+ * allocation/kset creation fails; -ENODEV if sb->s_id is empty; or the
+ * error returned by sysfs_create_group().
+ */
+int ocfs2_filecheck_create_sysfs(struct super_block *sb)
+{
+	struct ocfs2_filecheck_sysfs_entry *entry = NULL;
+	struct ocfs2_filecheck *fcheck = NULL;
+	struct kset *fcheck_kset = NULL;
+	struct kset *device_kset = NULL;
+	struct attribute **attrs = NULL;
+	struct attribute_group attrgp;
+	int ret;
+
+	if (!ocfs2_kset)
+		return -ENOMEM;
+
+	/* Failures are allocation failures unless stated otherwise below. */
+	ret = -ENOMEM;
+
+	attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
+	if (!attrs)
+		goto error;
+
+	/* NULL-terminated attribute array wrapped in an unnamed group. */
+	attrs[0] = &ocfs2_attr_filecheck_chk.attr;
+	attrs[1] = &ocfs2_attr_filecheck_fix.attr;
+	attrs[2] = &ocfs2_attr_filecheck_set.attr;
+	attrs[3] = NULL;
+	memset(&attrgp, 0, sizeof(attrgp));
+	attrgp.attrs = attrs;
+
+	fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
+	if (!fcheck)
+		goto error;
+
+	INIT_LIST_HEAD(&fcheck->fc_head);
+	spin_lock_init(&fcheck->fc_lock);
+	fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
+	fcheck->fc_size = 0;
+	fcheck->fc_done = 0;
+
+	if (strlen(sb->s_id) <= 0) {
+		mlog(ML_ERROR,
+		     "Cannot get device basename when create filecheck sysfs\n");
+		ret = -ENODEV;
+		goto error;
+	}
+
+	device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
+	if (!device_kset)
+		goto error;
+
+	fcheck_kset = kset_create_and_add("filecheck", NULL,
+					  &device_kset->kobj);
+	if (!fcheck_kset)
+		goto error;
+
+	ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
+	if (ret)
+		goto error;
+
+	entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	/* The initial reference is dropped in ocfs2_filecheck_sysfs_free(). */
+	atomic_set(&entry->fs_count, 1);
+	entry->fs_sb = sb;
+	entry->fs_devicekset = device_kset;
+	entry->fs_fcheckkset = fcheck_kset;
+	entry->fs_fcheck = fcheck;
+	ocfs2_filecheck_sysfs_add(entry);
+
+	kfree(attrs);
+	return 0;
+
+error:
+	/* Pointers that were never set are still NULL at this point. */
+	kfree(attrs);
+	kfree(entry);
+	kfree(fcheck);
+	kset_unregister(fcheck_kset);
+	kset_unregister(device_kset);
+	return ret;
+}
+
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb)
+{ /* tear down the sysfs entry registered for sb->s_id */
+ return ocfs2_filecheck_sysfs_del(sb->s_id); /* 0 when an entry was found and freed, 1 when none matched */
+}
+
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+ unsigned int count); /* forward declaration: used by ocfs2_filecheck_adjust_max() before its definition */
+/*
+ * Change the maximum length of the file check list of @ent to @len.
+ *
+ * @len must lie within [OCFS2_FILECHECK_MINSIZE, OCFS2_FILECHECK_MAXSIZE].
+ * The request is refused while more than @len entries are still pending
+ * (not done).  If the list currently holds more than @len entries, the
+ * surplus is erased from the done ones; failing to erase that many is a
+ * BUG.
+ *
+ * Returns 0 on success, -EINVAL for an out-of-range @len, -EBUSY when
+ * too many entries are pending.
+ */
+static int
+ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
+			   unsigned int len)
+{
+	struct ocfs2_filecheck *fcheck;
+	unsigned int pending;
+	int erased;
+	int ret = 0;
+
+	if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE))
+		return -EINVAL;
+
+	fcheck = ent->fs_fcheck;
+
+	spin_lock(&fcheck->fc_lock);
+	pending = fcheck->fc_size - fcheck->fc_done;
+	if (len < pending) {
+		mlog(ML_ERROR,
+		"Cannot set online file check maximum entry number "
+		"to %u due to too many pending entries(%u)\n",
+		len, pending);
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (len < fcheck->fc_size) {
+		/* Shrinking: enough done entries must exist to be dropped. */
+		erased = ocfs2_filecheck_erase_entries(ent,
+						fcheck->fc_size - len);
+		BUG_ON(!erased);
+	}
+	fcheck->fc_max = len;
+
+unlock:
+	spin_unlock(&fcheck->fc_lock);
+	return ret;
+}
+
+#define OCFS2_FILECHECK_ARGS_LEN 24 /* size of the on-stack parse buffer, terminating NUL included */
+/*
+ * Parse the first @count bytes of @buf as an unsigned long.
+ *
+ * @buf need not be NUL-terminated; the bytes are copied into a local
+ * buffer and terminated there before conversion.  The base is detected
+ * from the prefix (base argument 0 to kstrtoul()).
+ *
+ * Returns 0 on success with the result in @val, 1 if @count does not fit
+ * the local buffer or the text is not a valid number.
+ */
+static int
+ocfs2_filecheck_args_get_long(const char *buf, size_t count,
+			      unsigned long *val)
+{
+	char buffer[OCFS2_FILECHECK_ARGS_LEN];
+
+	/*
+	 * Keep room for the terminating NUL.  Do not rely on every caller
+	 * having bounded @count already: an oversized write must never
+	 * overrun the stack buffer.
+	 */
+	if (count >= sizeof(buffer))
+		return 1;
+
+	memcpy(buffer, buf, count);
+	buffer[count] = '\0';
+
+	if (kstrtoul(buffer, 0, val))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Map a sysfs attribute name onto a file check operation type.
+ *
+ * "fix", "check" and "set" are recognised; the comparison length includes
+ * the terminating NUL, so only exact names match.
+ *
+ * Returns 0 with *@type set, or 1 (and *@type untouched) for any other
+ * name.
+ */
+static int
+ocfs2_filecheck_type_parse(const char *name, unsigned int *type)
+{
+	if (strncmp(name, "fix", 4) == 0) {
+		*type = OCFS2_FILECHECK_TYPE_FIX;
+		return 0;
+	}
+
+	if (strncmp(name, "check", 6) == 0) {
+		*type = OCFS2_FILECHECK_TYPE_CHK;
+		return 0;
+	}
+
+	if (strncmp(name, "set", 4) == 0) {
+		*type = OCFS2_FILECHECK_TYPE_SET;
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Parse a value written to the attribute called @name into @args.
+ *
+ * @name selects the operation type.  The @count bytes at @buf must hold a
+ * non-zero unsigned number.  For "set" it is stored as the new list
+ * length (fa_len), otherwise as the inode number (fa_ino).
+ *
+ * Returns 0 on success.  Returns 1 if @count is zero or does not fit the
+ * parse buffer, @name is unknown, the text is not a number, the number is
+ * zero, or a "set" length does not fit in an unsigned int.  @args is left
+ * untouched on failure.
+ */
+static int
+ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
+			   struct ocfs2_filecheck_args *args)
+{
+	unsigned long val = 0;
+	unsigned int type;
+
+	/* too short/long args length */
+	if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN))
+		return 1;
+
+	if (ocfs2_filecheck_type_parse(name, &type))
+		return 1;
+	if (ocfs2_filecheck_args_get_long(buf, count, &val))
+		return 1;
+
+	if (!val)
+		return 1;
+
+	/*
+	 * fa_len is narrower than the parsed value on 64-bit builds.  Refuse
+	 * anything that would be truncated, so a huge number cannot wrap
+	 * around into an apparently valid length.
+	 */
+	if (type == OCFS2_FILECHECK_TYPE_SET && val > UINT_MAX)
+		return 1;
+
+	args->fa_type = type;
+	if (type == OCFS2_FILECHECK_TYPE_SET)
+		args->fa_len = (unsigned int)val;
+	else
+		args->fa_ino = val;
+
+	return 0;
+}
+
+/*
+ * sysfs show handler shared by the "check", "fix" and "set" attributes.
+ *
+ * For "set" the current maximum list length is printed.  For "check" and
+ * "fix" a header line is printed, followed by one line per list entry of
+ * the matching type: inode number, done flag and status name.
+ *
+ * The mount is identified by the name of @kobj's parent, which is looked
+ * up in the global list of sysfs entries.
+ *
+ * Returns the number of bytes placed in @buf; -EINVAL for an unknown
+ * attribute name; -ENODEV if no entry matches; -E2BIG if the listing does
+ * not fit in PAGE_SIZE bytes; or a negative value passed through from
+ * snprintf().
+ */
+static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
+	unsigned int type;
+	struct ocfs2_filecheck_entry *p;
+	struct ocfs2_filecheck_sysfs_entry *ent;
+
+	if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
+		return -EINVAL;
+
+	ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+	if (!ent) {
+		/* Report the name that was actually used for the lookup. */
+		mlog(ML_ERROR,
+		"Cannot get the corresponding entry via device basename %s\n",
+		kobj->parent->name);
+		return -ENODEV;
+	}
+
+	if (type == OCFS2_FILECHECK_TYPE_SET) {
+		spin_lock(&ent->fs_fcheck->fc_lock);
+		total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
+		spin_unlock(&ent->fs_fcheck->fc_lock);
+		goto exit;
+	}
+
+	ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n");
+	total += ret;
+	remain -= ret;
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
+		if (p->fe_type != type)
+			continue;
+
+		ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n",
+			       p->fe_ino, p->fe_done,
+			       ocfs2_filecheck_error(p->fe_status));
+		if (ret < 0) {
+			total = ret;
+			break;
+		}
+		/*
+		 * snprintf() returns the length the output would have had,
+		 * so anything >= remain means the line was truncated.
+		 * Testing only for equality would let total run past the
+		 * page and turn remain negative.
+		 */
+		if (ret >= remain) {
+			/* snprintf() didn't fit */
+			total = -E2BIG;
+			break;
+		}
+		total += ret;
+		remain -= ret;
+	}
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+
+exit:
+	ocfs2_filecheck_sysfs_put(ent);
+	return total;
+}
+
+/*
+ * Drop the first finished entry found on the filecheck list.
+ *
+ * The entry is unlinked and freed, and both the list size and the done
+ * counter are decremented.  The walk stops right after the removal, so the
+ * freed element is never used to advance the iteration.  The callers in
+ * this file invoke it with fc_lock held.
+ *
+ * Returns 1 if an entry was removed, 0 if no finished entry exists.
+ */
+static int
+ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
+{
+	struct ocfs2_filecheck_entry *cur;
+
+	list_for_each_entry(cur, &ent->fs_fcheck->fc_head, fe_list) {
+		if (!cur->fe_done)
+			continue;
+
+		list_del(&cur->fe_list);
+		kfree(cur);
+		ent->fs_fcheck->fc_size--;
+		ent->fs_fcheck->fc_done--;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove up to @count finished entries from the filecheck list.
+ *
+ * Removal stops at the first failure, i.e. as soon as no finished entry is
+ * left.
+ *
+ * Returns 1 if exactly @count entries were removed (trivially true for a
+ * @count of zero), 0 otherwise.
+ */
+static int
+ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent,
+			      unsigned int count)
+{
+	unsigned int erased;
+
+	for (erased = 0; erased < count; erased++) {
+		if (!ocfs2_filecheck_erase_entry(ent))
+			break;
+	}
+
+	return erased == count;
+}
+
+/*
+ * Mark @entry as finished and account for it in the done counter.
+ *
+ * The show and erase paths read fe_done while holding fc_lock, so the flag
+ * is set inside the same critical section that bumps fc_done.  That way
+ * the flag and the counter always change together from the point of view
+ * of anyone holding the lock.
+ */
+static void
+ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+			   struct ocfs2_filecheck_entry *entry)
+{
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	entry->fe_done = 1;
+	ent->fs_fcheck->fc_done++;
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+}
+
+/*
+ * Run one check/fix pass on inode @ino by loading it through ocfs2_iget()
+ * with the given filecheck @flags.
+ *
+ * Returns OCFS2_FILECHECK_ERR_SUCCESS if the inode could be obtained (the
+ * reference is dropped again right away).  On failure the negated error
+ * is returned when it lies in
+ * [OCFS2_FILECHECK_ERR_START, OCFS2_FILECHECK_ERR_END); anything else is
+ * reported as OCFS2_FILECHECK_ERR_FAILED.
+ */
+static unsigned int
+ocfs2_filecheck_handle(struct super_block *sb,
+		       unsigned long ino, unsigned int flags)
+{
+	struct inode *inode;
+	int rc;
+
+	inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0);
+	if (!IS_ERR(inode)) {
+		iput(inode);
+		return OCFS2_FILECHECK_ERR_SUCCESS;
+	}
+
+	/* The error pointer carries a negative code; flip the sign. */
+	rc = (int)(-PTR_ERR(inode));
+	if (rc < OCFS2_FILECHECK_ERR_START || rc >= OCFS2_FILECHECK_ERR_END)
+		return OCFS2_FILECHECK_ERR_FAILED;
+
+	return rc;
+}
+
+/*
+ * Execute the request described by @entry and record its outcome.
+ *
+ * Check and fix requests are forwarded to ocfs2_filecheck_handle() with
+ * the matching ocfs2_iget() flag; any other type is recorded as
+ * unsupported.  The entry is marked done afterwards in every case.
+ */
+static void
+ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
+			     struct ocfs2_filecheck_entry *entry)
+{
+	unsigned int status;
+
+	if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) {
+		status = ocfs2_filecheck_handle(ent->fs_sb, entry->fe_ino,
+						OCFS2_FI_FLAG_FILECHECK_CHK);
+	} else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) {
+		status = ocfs2_filecheck_handle(ent->fs_sb, entry->fe_ino,
+						OCFS2_FI_FLAG_FILECHECK_FIX);
+	} else {
+		status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
+	}
+
+	entry->fe_status = status;
+	ocfs2_filecheck_done_entry(ent, entry);
+}
+
+/*
+ * sysfs "store" handler shared by the filecheck attributes.
+ *
+ * A write to the "set" attribute adjusts the maximum queue size.  A write
+ * to the other attributes queues a new entry for the given inode number
+ * and processes it synchronously before returning.  When the queue is
+ * full, one finished entry is evicted to make room; if nothing has
+ * finished yet the request is refused.
+ *
+ * Returns @count on success (including an empty write), -EINVAL for bad
+ * arguments, -ENODEV if no filecheck entry exists for the device, -ENOMEM
+ * on allocation failure, -EBUSY if the queue is full, or the result of
+ * ocfs2_filecheck_adjust_max() when that is non-zero.
+ */
+static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct ocfs2_filecheck_sysfs_entry *ent;
+	struct ocfs2_filecheck_entry *entry;
+	struct ocfs2_filecheck_args args;
+	ssize_t ret = 0;
+	int full;
+
+	if (!count)
+		return count;
+
+	if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) {
+		mlog(ML_ERROR, "Invalid arguments for online file check\n");
+		return -EINVAL;
+	}
+
+	ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
+	if (!ent) {
+		mlog(ML_ERROR,
+		     "Cannot get the corresponding entry via device basename %s\n",
+		     kobj->parent->name);
+		return -ENODEV;
+	}
+
+	if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
+		ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
+		goto exit;
+	}
+
+	/* Allocate before taking the spinlock. */
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	spin_lock(&ent->fs_fcheck->fc_lock);
+	full = ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max;
+	if (full && (ent->fs_fcheck->fc_done == 0)) {
+		/* Nothing finished yet, so nothing can be evicted. */
+		mlog(ML_ERROR,
+		     "Cannot do more file check "
+		     "since file check queue(%u) is full now\n",
+		     ent->fs_fcheck->fc_max);
+		ret = -EBUSY;
+		kfree(entry);
+	} else {
+		if (full && (ent->fs_fcheck->fc_done > 0)) {
+			/* Delete the oldest entry which was done,
+			 * make sure the entry size in list does
+			 * not exceed maximum value
+			 */
+			BUG_ON(!ocfs2_filecheck_erase_entry(ent));
+		}
+
+		entry->fe_ino = args.fa_ino;
+		entry->fe_type = args.fa_type;
+		entry->fe_done = 0;
+		entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS;
+		list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head);
+		ent->fs_fcheck->fc_size++;
+	}
+	spin_unlock(&ent->fs_fcheck->fc_lock);
+
+	/* The actual check/fix runs outside the spinlock. */
+	if (ret == 0)
+		ocfs2_filecheck_handle_entry(ent, entry);
+
+exit:
+	ocfs2_filecheck_sysfs_put(ent);
+	return ret ? ret : (ssize_t)count;
+}
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
new file mode 100644
index 000000000000..e5cd002a2c09
--- /dev/null
+++ b/fs/ocfs2/filecheck.h
@@ -0,0 +1,49 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * filecheck.h
+ *
+ * Online file check.
+ *
+ * Copyright (C) 2016 SuSE. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+
+#ifndef FILECHECK_H
+#define FILECHECK_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+
+/*
+ * File check errno
+ *
+ * Status codes recorded for a file check entry.  Apart from success, the
+ * codes start at 1000.  The inode validation/repair code returns them
+ * negated (e.g. -OCFS2_FILECHECK_ERR_BLOCKECC).
+ */
+enum {
+ OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */
+ OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */
+ OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */
+ OCFS2_FILECHECK_ERR_READONLY, /* Read only */
+ OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */
+ OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */
+ OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */
+ OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */
+ OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */
+ OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */
+ OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */
+};
+
+/*
+ * First and last file check specific codes.  Note that END names the last
+ * enumerator itself (UNSUPPORTED), not one past it.
+ */
+#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
+#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
+
+int ocfs2_filecheck_create_sysfs(struct super_block *sb);
+int ocfs2_filecheck_remove_sysfs(struct super_block *sb);
+
+#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8f87e05ee25d..12f4a9e9800f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
#include "xattr.h"
#include "refcounttree.h"
#include "ocfs2_trace.h"
+#include "filecheck.h"
#include "buffer_head_io.h"
@@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh);
+static int ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+ struct buffer_head **bh,
+ int flags, int type);
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+
void ocfs2_set_inode_flags(struct inode *inode)
{
unsigned int flags = OCFS2_I(inode)->ip_attr;
@@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
+ int rc = 0;
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
@@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
}
trace_ocfs2_iget5_locked(inode->i_state);
if (inode->i_state & I_NEW) {
- ocfs2_read_locked_inode(inode, &args);
+ rc = ocfs2_read_locked_inode(inode, &args);
unlock_new_inode(inode);
}
if (is_bad_inode(inode)) {
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) ||
+ (flags & OCFS2_FI_FLAG_FILECHECK_FIX))
+ /* Return OCFS2_FILECHECK_ERR_XXX related errno */
+ inode = ERR_PTR(rc);
+ else
+ inode = ERR_PTR(-ESTALE);
goto bail;
}
@@ -361,6 +376,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
break;
case S_IFLNK:
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
default:
@@ -409,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_super *osb;
struct ocfs2_dinode *fe;
struct buffer_head *bh = NULL;
- int status, can_lock;
+ int status, can_lock, lock_level = 0;
u32 generation = 0;
status = -EINVAL;
@@ -477,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
mlog_errno(status);
return status;
}
- status = ocfs2_inode_lock(inode, NULL, 0);
+ status = ocfs2_inode_lock(inode, NULL, lock_level);
if (status) {
make_bad_inode(inode);
mlog_errno(status);
@@ -494,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
if (can_lock) {
- status = ocfs2_read_inode_block_full(inode, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 0);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE, 1);
+ else
+ status = ocfs2_read_inode_block_full(inode,
+ &bh, OCFS2_BH_IGNORE_CACHE);
} else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
/*
* If buffer is in jbd, then its checksum may not have been
* computed as yet.
*/
- if (!status && !buffer_jbd(bh))
- status = ocfs2_validate_inode_block(osb->sb, bh);
+ if (!status && !buffer_jbd(bh)) {
+ if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK)
+ status = ocfs2_filecheck_validate_inode_block(
+ osb->sb, bh);
+ else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX)
+ status = ocfs2_filecheck_repair_inode_block(
+ osb->sb, bh);
+ else
+ status = ocfs2_validate_inode_block(
+ osb->sb, bh);
+ }
}
if (status < 0) {
mlog_errno(status);
@@ -531,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode,
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
+ if (buffer_dirty(bh) && !buffer_jbd(bh)) {
+ if (can_lock) {
+ ocfs2_inode_unlock(inode, lock_level);
+ lock_level = 1;
+ ocfs2_inode_lock(inode, NULL, lock_level);
+ }
+ status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = 0;
bail:
if (can_lock)
- ocfs2_inode_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, lock_level);
if (status < 0)
make_bad_inode(inode);
@@ -629,10 +674,10 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail;
}
- mutex_lock(&inode_alloc_inode->i_mutex);
+ inode_lock(inode_alloc_inode);
status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
if (status < 0) {
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
mlog_errno(status);
goto bail;
@@ -679,7 +724,7 @@ bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
ocfs2_inode_unlock(inode_alloc_inode, 1);
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
brelse(inode_alloc_bh);
bail:
iput(inode_alloc_inode);
@@ -750,10 +795,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
/* Lock the orphan dir. The lock will be held for the entire
* delete_inode operation. We do this now to avoid races with
* recovery completion on other nodes. */
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
mlog_errno(status);
goto bail;
@@ -802,7 +847,7 @@ bail_unlock_dir:
return status;
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
brelse(orphan_dir_bh);
bail:
iput(orphan_dir_inode);
@@ -1125,6 +1170,9 @@ static void ocfs2_clear_inode(struct inode *inode)
mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
"Clear inode of %llu, inode has io markers\n",
(unsigned long long)oi->ip_blkno);
+ mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list),
+ "Clear inode of %llu, inode has unwritten extents\n",
+ (unsigned long long)oi->ip_blkno);
ocfs2_extent_map_trunc(inode, 0);
@@ -1396,6 +1444,169 @@ bail:
return rc;
}
+/*
+ * Validate an on-disk inode block on behalf of online file check.
+ *
+ * The checks run in this order: metadata ecc, dinode signature, i_blkno,
+ * OCFS2_VALID_FL and fs generation.  The first failing check decides the
+ * result, except that a bad signature takes precedence over an ecc
+ * failure.
+ *
+ * Returns 0 if the block looks sane, otherwise a negated
+ * OCFS2_FILECHECK_ERR_* code naming the failed check.
+ */
+static int ocfs2_filecheck_validate_inode_block(struct super_block *sb,
+						struct buffer_head *bh)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+	int ecc_rc;
+
+	trace_ocfs2_filecheck_validate_inode_block(
+		(unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * Call ocfs2_validate_meta_ecc() first since it has an ecc repair
+	 * function, but do not return immediately when ecc validation
+	 * fails: the most likely reason is that a wrong inode number was
+	 * supplied, which the signature check below reports instead.
+	 */
+	ecc_rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+	if (ecc_rc) {
+		mlog(ML_ERROR,
+		     "Filecheck: checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		ecc_rc = -OCFS2_FILECHECK_ERR_BLOCKECC;
+	}
+
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: signature = %.*s\n",
+		     (unsigned long long)bh->b_blocknr, 7, di->i_signature);
+		return -OCFS2_FILECHECK_ERR_INVALIDINO;
+	}
+
+	/* Signature is fine, so a pending ecc failure is the result. */
+	if (ecc_rc)
+		return ecc_rc;
+
+	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: i_blkno is %llu\n",
+		     (unsigned long long)bh->b_blocknr,
+		     (unsigned long long)le64_to_cpu(di->i_blkno));
+		return -OCFS2_FILECHECK_ERR_BLOCKNO;
+	}
+
+	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL "
+		     "not set\n",
+		     (unsigned long long)bh->b_blocknr);
+		return -OCFS2_FILECHECK_ERR_VALIDFLAG;
+	}
+
+	if (le32_to_cpu(di->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		mlog(ML_ERROR,
+		     "Filecheck: invalid dinode #%llu: fs_generation is %u\n",
+		     (unsigned long long)bh->b_blocknr,
+		     le32_to_cpu(di->i_fs_generation));
+		return -OCFS2_FILECHECK_ERR_GENERATION;
+	}
+
+	return 0;
+}
+
+/*
+ * Try to repair an on-disk inode block on behalf of online file check.
+ *
+ * Nothing is touched if the block already validates.  Otherwise a wrong
+ * i_blkno and a wrong fs generation are reset, and the metadata ecc is
+ * recomputed when something changed or the ecc does not verify; in that
+ * case the buffer is marked dirty.  A bad signature or a missing
+ * OCFS2_VALID_FL is not repaired.
+ *
+ * Returns 0 if the block is valid or was repaired, otherwise a negated
+ * OCFS2_FILECHECK_ERR_* code (read-only filesystem, buffer in jbd,
+ * invalid inode, valid flag).
+ */
+static int ocfs2_filecheck_repair_inode_block(struct super_block *sb,
+					      struct buffer_head *bh)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+	int need_ecc = 0;
+
+	/* Already consistent: no repair needed. */
+	if (ocfs2_filecheck_validate_inode_block(sb, bh) == 0)
+		return 0;
+
+	trace_ocfs2_filecheck_repair_inode_block(
+		(unsigned long long)bh->b_blocknr);
+
+	if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) ||
+	    ocfs2_is_soft_readonly(OCFS2_SB(sb))) {
+		mlog(ML_ERROR,
+		     "Filecheck: cannot repair dinode #%llu "
+		     "on readonly filesystem\n",
+		     (unsigned long long)bh->b_blocknr);
+		return -OCFS2_FILECHECK_ERR_READONLY;
+	}
+
+	if (buffer_jbd(bh)) {
+		mlog(ML_ERROR,
+		     "Filecheck: cannot repair dinode #%llu, "
+		     "its buffer is in jbd\n",
+		     (unsigned long long)bh->b_blocknr);
+		return -OCFS2_FILECHECK_ERR_INJBD;
+	}
+
+	/* Cannot fix invalid inode block */
+	if (!OCFS2_IS_VALID_DINODE(di))
+		return -OCFS2_FILECHECK_ERR_INVALIDINO;
+
+	/* Cannot just add VALID_FL flag back as a fix,
+	 * need more things to check here.
+	 */
+	if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL)))
+		return -OCFS2_FILECHECK_ERR_VALIDFLAG;
+
+	if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+		di->i_blkno = cpu_to_le64(bh->b_blocknr);
+		need_ecc = 1;
+		mlog(ML_ERROR,
+		     "Filecheck: reset dinode #%llu: i_blkno to %llu\n",
+		     (unsigned long long)bh->b_blocknr,
+		     (unsigned long long)le64_to_cpu(di->i_blkno));
+	}
+
+	if (le32_to_cpu(di->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
+		need_ecc = 1;
+		mlog(ML_ERROR,
+		     "Filecheck: reset dinode #%llu: fs_generation to %u\n",
+		     (unsigned long long)bh->b_blocknr,
+		     le32_to_cpu(di->i_fs_generation));
+	}
+
+	/* The ecc is only re-verified when no field was rewritten above. */
+	if (need_ecc ||
+	    ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) {
+		ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check);
+		mark_buffer_dirty(bh);
+		mlog(ML_ERROR,
+		     "Filecheck: reset dinode #%llu: compute meta ecc\n",
+		     (unsigned long long)bh->b_blocknr);
+	}
+
+	return 0;
+}
+
+/*
+ * Read the inode block of @inode with a file check specific callback.
+ *
+ * @type selects the callback handed to ocfs2_read_blocks(): zero uses the
+ * validating one, non-zero the repairing one.  @flags is passed through
+ * unchanged.
+ *
+ * On success, if the caller passed in a NULL *@bh, the buffer head obtained
+ * by the read is stored there.  Returns the result of ocfs2_read_blocks().
+ */
+static int
+ocfs2_filecheck_read_inode_block_full(struct inode *inode,
+				      struct buffer_head **bh,
+				      int flags, int type)
+{
+	int (*validate)(struct super_block *sb, struct buffer_head *bh);
+	struct buffer_head *tmp = *bh;
+	int rc;
+
+	if (type)	/* Repair inode block */
+		validate = ocfs2_filecheck_repair_inode_block;
+	else		/* Check inode block */
+		validate = ocfs2_filecheck_validate_inode_block;
+
+	rc = ocfs2_read_blocks(INODE_CACHE(inode),
+			       OCFS2_I(inode)->ip_blkno,
+			       1, &tmp, flags, validate);
+
+	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
int flags)
{
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index aac8b86f312e..d8f3fc8d2551 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,9 +43,6 @@ struct ocfs2_inode_info
/* protects extended attribute changes on this inode */
struct rw_semaphore ip_xattr_sem;
- /* Number of outstanding AIO's which are not page aligned */
- struct mutex ip_unaligned_aio;
-
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@@ -57,6 +54,9 @@ struct ocfs2_inode_info
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
+ /* Record unwritten extents during direct io. */
+ struct list_head ip_unwritten_list;
+
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
@@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode);
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4
+#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8
+
struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 3cb097ccce60..4506ec5ec2ea 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
unsigned oldflags;
int status;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
@@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
bail_unlock:
ocfs2_inode_unlock(inode, 1);
bail:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
brelse(bh);
@@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
struct ocfs2_dinode *dinode_alloc = NULL;
if (inode_alloc)
- mutex_lock(&inode_alloc->i_mutex);
+ inode_lock(inode_alloc);
if (o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
@@ -317,7 +317,7 @@ bail:
ocfs2_inode_unlock(inode_alloc, 0);
if (inode_alloc)
- mutex_unlock(&inode_alloc->i_mutex);
+ inode_unlock(inode_alloc);
brelse(bh);
@@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
struct ocfs2_dinode *gb_dinode = NULL;
if (gb_inode)
- mutex_lock(&gb_inode->i_mutex);
+ inode_lock(gb_inode);
if (o2info_coherent(&ffg->iff_req)) {
status = ocfs2_inode_lock(gb_inode, &bh, 0);
@@ -604,11 +604,9 @@ bail:
ocfs2_inode_unlock(gb_inode, 0);
if (gb_inode)
- mutex_unlock(&gb_inode->i_mutex);
-
- if (gb_inode)
- iput(gb_inode);
+ inode_unlock(gb_inode);
+ iput(gb_inode);
brelse(bh);
return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 13534f4fe5b5..e607419cdfa4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -231,7 +231,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
/* At this point, we know that no more recovery threads can be
* launched, so wait for any recovery completion work to
* complete. */
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
// up_write(&journal->j_trans_barrier);
done:
- if (inode)
- iput(inode);
+ iput(inode);
}
static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1327,7 +1326,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
- queue_work(ocfs2_wq, &journal->j_recovery_work);
+ queue_work(journal->j_osb->ocfs2_wq, &journal->j_recovery_work);
spin_unlock(&journal->j_lock);
}
@@ -1687,9 +1686,7 @@ done:
if (got_lock)
ocfs2_inode_unlock(inode, 1);
- if (inode)
- iput(inode);
-
+ iput(inode);
brelse(bh);
return status;
@@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
ocfs2_inode_unlock(inode, 1);
bail:
- if (inode)
- iput(inode);
+ iput(inode);
return status;
}
@@ -1972,7 +1968,7 @@ static void ocfs2_orphan_scan_work(struct work_struct *work)
mutex_lock(&os->os_lock);
ocfs2_queue_orphan_scan(osb);
if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
mutex_unlock(&os->os_lock);
}
@@ -2012,7 +2008,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
else {
atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
- queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+ queue_delayed_work(osb->ocfs2_wq, &os->os_orphan_scan_work,
ocfs2_orphan_scan_timeout());
}
}
@@ -2092,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
return status;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
if (status < 0) {
mlog_errno(status);
@@ -2110,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
out_cluster:
ocfs2_inode_unlock(orphan_dir_inode, 0);
out:
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
return status;
}
@@ -2200,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
oi->ip_next_orphan = NULL;
if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = ocfs2_rw_lock(inode, 1);
if (ret < 0) {
mlog_errno(ret);
@@ -2239,7 +2235,7 @@ unlock_inode:
unlock_rw:
ocfs2_rw_unlock(inode, 1);
unlock_mutex:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/* clear dio flag in ocfs2_inode_info */
oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0a4457fb0711..fe0d1f9571bb 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
brelse(alloc_bh);
- if (inode)
- iput(inode);
+ iput(inode);
trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
@@ -387,7 +386,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
struct ocfs2_dinode *alloc = NULL;
cancel_delayed_work(&osb->la_enable_wq);
- flush_workqueue(ocfs2_wq);
+ flush_workqueue(osb->ocfs2_wq);
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -415,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
@@ -469,12 +468,11 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
- if (local_alloc_inode)
- iput(local_alloc_inode);
+ iput(local_alloc_inode);
kfree(alloc_copy);
}
@@ -508,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
status = ocfs2_read_inode_block_full(inode, &alloc_bh,
OCFS2_BH_IGNORE_CACHE);
@@ -541,7 +539,7 @@ bail:
brelse(alloc_bh);
if (inode) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
}
@@ -573,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (status < 0) {
@@ -603,7 +601,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
brelse(main_bm_bh);
@@ -645,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&local_alloc_inode->i_mutex);
+ inode_lock(local_alloc_inode);
/*
* We must double check state and allocator bits because
@@ -711,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
status = 0;
bail:
if (status < 0 && local_alloc_inode) {
- mutex_unlock(&local_alloc_inode->i_mutex);
+ inode_unlock(local_alloc_inode);
iput(local_alloc_inode);
}
@@ -1087,7 +1085,7 @@ static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
} else {
osb->local_alloc_state = OCFS2_LA_DISABLED;
}
- queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ queue_delayed_work(osb->ocfs2_wq, &osb->la_enable_wq,
OCFS2_LA_ENABLE_INTERVAL);
goto out_unlock;
}
@@ -1327,9 +1325,7 @@ bail:
brelse(main_bm_bh);
- if (main_bm_inode)
- iput(main_bm_inode);
-
+ iput(main_bm_inode);
kfree(alloc_copy);
if (ac)
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 652ece4a9d9e..d56f0079b858 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -67,7 +67,10 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
*/
locks_lock_file_wait(file,
- &(struct file_lock){.fl_type = F_UNLCK});
+ &(struct file_lock) {
+ .fl_type = F_UNLCK,
+ .fl_flags = FL_FLOCK
+ });
ocfs2_file_unlock(file);
}
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190f6e1..71545ad4628c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -65,13 +65,13 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
- unsigned int len = PAGE_CACHE_SIZE;
+ unsigned int len = PAGE_SIZE;
pgoff_t last_index;
struct page *locked_page = NULL;
void *fsdata;
loff_t size = i_size_read(inode);
- last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+ last_index = (size - 1) >> PAGE_SHIFT;
/*
* There are cases that lead to the page no longer bebongs to the
@@ -102,10 +102,10 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
* because the "write" would invalidate their data.
*/
if (page->index == last_index)
- len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
+ len = ((size - 1) & ~PAGE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
- &fsdata, di_bh, page);
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ &locked_page, &fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 124471d26a73..e3d05d9901a3 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
* context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
*/
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -338,7 +338,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock_mutex:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
if (context->data_ac) {
ocfs2_free_alloc_context(context->data_ac);
@@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
goto out;
}
- mutex_lock(&gb_inode->i_mutex);
+ inode_lock(gb_inode);
ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
if (ret) {
@@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
goto out_unlock_gb_mutex;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
@@ -708,11 +708,11 @@ out_commit:
brelse(gd_bh);
out_unlock_tl_inode:
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
ocfs2_inode_unlock(gb_inode, 1);
out_unlock_gb_mutex:
- mutex_unlock(&gb_inode->i_mutex);
+ inode_unlock(gb_inode);
brelse(gb_bh);
iput(gb_inode);
@@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/*
* This prevents concurrent writes from other nodes
@@ -969,7 +969,7 @@ out_inode_unlock:
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return status;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3b48ac25d8a7..6b3e87189a64 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -367,7 +367,7 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = posix_acl_create(dir, &mode, &default_acl, &acl);
+ status = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (status) {
mlog_errno(status);
goto leave;
@@ -1045,7 +1045,7 @@ leave:
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
@@ -1664,7 +1664,7 @@ bail:
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
@@ -1683,8 +1683,7 @@ bail:
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
- if (new_inode)
- iput(new_inode);
+ iput(new_inode);
ocfs2_free_dir_lookup_result(&target_lookup_res);
ocfs2_free_dir_lookup_result(&old_entry_lookup);
@@ -1958,6 +1957,7 @@ static int ocfs2_symlink(struct inode *dir,
inode->i_rdev = 0;
newsize = l - 1;
inode->i_op = &ocfs2_symlink_inode_operations;
+ inode_nohighmem(inode);
if (l > ocfs2_fast_symlink_chars(sb)) {
u32 offset = 0;
@@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
return ret;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (ret < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
mlog_errno(ret);
@@ -2226,7 +2226,7 @@ out:
if (ret) {
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
}
@@ -2372,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
name, strlen(name));
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(orphan_dir_inode),
+ orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
/* find it's spot in the orphan directory */
status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
&lookup);
@@ -2387,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access_di(handle,
- INODE_CACHE(orphan_dir_inode),
- orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
/* do the i_nlink dance! :) */
orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
if (S_ISDIR(inode->i_mode))
@@ -2495,7 +2495,7 @@ out:
ocfs2_free_alloc_context(inode_ac);
/* Unroll orphan dir locking */
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
ocfs2_inode_unlock(orphan_dir, 1);
iput(orphan_dir);
}
@@ -2602,7 +2602,7 @@ leave:
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
- mutex_unlock(&orphan_dir->i_mutex);
+ inode_unlock(orphan_dir);
iput(orphan_dir);
}
@@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
bail_unlock_orphan:
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
ocfs2_free_dir_lookup_result(&orphan_insert);
@@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
goto bail;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
mlog_errno(status);
goto bail;
@@ -2770,7 +2770,7 @@ bail_commit:
bail_unlock_orphan:
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
brelse(orphan_dir_bh);
iput(orphan_dir_inode);
@@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
goto leave;
}
- mutex_lock(&orphan_dir_inode->i_mutex);
+ inode_lock(orphan_dir_inode);
status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
if (status < 0) {
mlog_errno(status);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
goto leave;
}
@@ -2901,7 +2901,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
orphan_unlock:
ocfs2_inode_unlock(orphan_dir_inode, 1);
- mutex_unlock(&orphan_dir_inode->i_mutex);
+ inode_unlock(orphan_dir_inode);
iput(orphan_dir_inode);
leave:
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7a0126267847..e63af7ddfe68 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -464,6 +464,14 @@ struct ocfs2_super
struct ocfs2_refcount_tree *osb_ref_tree_lru;
struct mutex system_file_mutex;
+
+ /*
+ * OCFS2 needs to schedule several different types of work which
+ * require cluster locking, disk I/O, recovery waits, etc. Since these
+ * types of work tend to be heavy we avoid using the kernel events
+ * workqueue and schedule on our own.
+ */
+ struct workqueue_struct *ocfs2_wq;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -814,10 +822,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
u32 clusters = pg_index;
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
- if (unlikely(PAGE_CACHE_SHIFT > cbits))
- clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
- else if (PAGE_CACHE_SHIFT < cbits)
- clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+ if (unlikely(PAGE_SHIFT > cbits))
+ clusters = pg_index << (PAGE_SHIFT - cbits);
+ else if (PAGE_SHIFT < cbits)
+ clusters = pg_index >> (cbits - PAGE_SHIFT);
return clusters;
}
@@ -831,10 +839,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
pgoff_t index = clusters;
- if (PAGE_CACHE_SHIFT > cbits) {
- index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
- } else if (PAGE_CACHE_SHIFT < cbits) {
- index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
+ if (PAGE_SHIFT > cbits) {
+ index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits);
+ } else if (PAGE_SHIFT < cbits) {
+ index = (pgoff_t)clusters << (cbits - PAGE_SHIFT);
}
return index;
@@ -845,8 +853,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
unsigned int pages_per_cluster = 1;
- if (PAGE_CACHE_SHIFT < cbits)
- pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+ if (PAGE_SHIFT < cbits)
+ pages_per_cluster = 1 << (cbits - PAGE_SHIFT);
return pages_per_cluster;
}
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6cb019b7c6a8..f8f5fc5e6c05 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,28 +1450,20 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- int appending, unsigned long count,
- int *direct_io, int *has_refcount),
- TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+ unsigned long count),
+ TP_ARGS(ino, saved_pos, count),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
- __field(int, appending)
__field(unsigned long, count)
- __field(int, direct_io)
- __field(int, has_refcount)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
- __entry->appending = appending;
__entry->count = count;
- __entry->direct_io = direct_io ? *direct_io : -1;
- __entry->has_refcount = has_refcount ? *has_refcount : -1;
),
- TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
- __entry->saved_pos, __entry->appending, __entry->count,
- __entry->direct_io, __entry->has_refcount)
+ TP_printk("%llu %llu %lu", __entry->ino,
+ __entry->saved_pos, __entry->count)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
@@ -1540,6 +1532,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block);
+DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block);
TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
TP_PROTO(void *task, void *dc_task, unsigned long long ino,
@@ -2035,6 +2029,8 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_get_next_id);
+
DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
/* End of trace events for fs/ocfs2/quota_global.c. */
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index b6d51333ad02..d153e6e31529 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -82,7 +82,7 @@ struct ocfs2_quota_chunk {
extern struct kmem_cache *ocfs2_dquot_cachep;
extern struct kmem_cache *ocfs2_qf_chunk_cachep;
-extern struct qtree_fmt_operations ocfs2_global_ops;
+extern const struct qtree_fmt_operations ocfs2_global_ops;
struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
struct ocfs2_super *osb, int slot_num);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c93d67220887..ab6a6cdcf91c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
dquot->dq_id);
}
-struct qtree_fmt_operations ocfs2_global_ops = {
+const struct qtree_fmt_operations ocfs2_global_ops = {
.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
.disk2mem_dqblk = ocfs2_global_disk2memdqb,
.is_id = ocfs2_global_is_id,
@@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
WARN_ON(bh != oinfo->dqi_gqi_bh);
spin_unlock(&dq_data_lock);
if (ex) {
- mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+ inode_lock(oinfo->dqi_gqinode);
down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
} else {
down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
@@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
if (ex) {
up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
- mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+ inode_unlock(oinfo->dqi_gqinode);
} else {
up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
}
@@ -726,7 +726,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
dqgrab(dquot);
/* First entry on list -> queue work */
if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
- queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ queue_work(osb->ocfs2_wq, &osb->dquot_drop_work);
goto out;
}
status = ocfs2_lock_global_qf(oinfo, 1);
@@ -860,6 +860,37 @@ out:
return status;
}
+static int ocfs2_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ int type = qid->type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ int status = 0;
+
+ trace_ocfs2_get_next_id(from_kqid(&init_user_ns, *qid), type);
+ if (!sb_has_quota_loaded(sb, type)) {
+ status = -ESRCH;
+ goto out;
+ }
+ status = ocfs2_lock_global_qf(info, 0);
+ if (status < 0)
+ goto out;
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_global;
+ status = qtree_get_next_id(&info->dqi_gi, qid);
+ ocfs2_qinfo_unlock(info, 0);
+out_global:
+ ocfs2_unlock_global_qf(info, 0);
+out:
+ /*
+ * Avoid logging ENOENT since it just means there isn't next ID and
+ * ESRCH which means quota isn't enabled for the filesystem.
+ */
+ if (status && status != -ENOENT && status != -ESRCH)
+ mlog_errno(status);
+ return status;
+}
+
static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
{
unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
@@ -968,4 +999,5 @@ const struct dquot_operations ocfs2_quota_operations = {
.write_info = ocfs2_write_info,
.alloc_dquot = ocfs2_alloc_dquot,
.destroy_dquot = ocfs2_destroy_dquot,
+ .get_next_id = ocfs2_get_next_id,
};
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 252119860e6c..744d5d90c363 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
mlog_errno(ret);
goto out;
}
- mutex_lock(&alloc_inode->i_mutex);
+ inode_lock(alloc_inode);
ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
if (ret) {
@@ -867,7 +867,7 @@ out_unlock:
}
out_mutex:
if (alloc_inode) {
- mutex_unlock(&alloc_inode->i_mutex);
+ inode_unlock(alloc_inode);
iput(alloc_inode);
}
out:
@@ -2937,16 +2937,16 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
end = i_size_read(inode);
while (offset < end) {
- page_index = offset >> PAGE_CACHE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ page_index = offset >> PAGE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
if (map_end > end)
map_end = end;
/* from, to is the offset within the page. */
- from = offset & (PAGE_CACHE_SIZE - 1);
- to = PAGE_CACHE_SIZE;
- if (map_end & (PAGE_CACHE_SIZE - 1))
- to = map_end & (PAGE_CACHE_SIZE - 1);
+ from = offset & (PAGE_SIZE - 1);
+ to = PAGE_SIZE;
+ if (map_end & (PAGE_SIZE - 1))
+ to = map_end & (PAGE_SIZE - 1);
page = find_or_create_page(mapping, page_index, GFP_NOFS);
if (!page) {
@@ -2956,10 +2956,10 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
/*
- * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
+ * In case PAGE_SIZE <= CLUSTER_SIZE, This page
* can't be dirtied before we CoW it out.
*/
- if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
+ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
if (!PageUptodate(page)) {
@@ -2987,7 +2987,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
mark_page_accessed(page);
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
offset = map_end;
if (ret)
@@ -3165,8 +3165,8 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
}
while (offset < end) {
- page_index = offset >> PAGE_CACHE_SHIFT;
- map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+ page_index = offset >> PAGE_SHIFT;
+ map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
if (map_end > end)
map_end = end;
@@ -3182,7 +3182,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
mark_page_accessed(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
offset = map_end;
if (ret)
@@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
goto out;
}
- mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(new_inode, I_MUTEX_CHILD);
ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
OI_LS_REFLINK_TARGET);
if (ret) {
@@ -4231,7 +4231,7 @@ inode_unlock:
ocfs2_inode_unlock(new_inode, 1);
brelse(new_bh);
out_unlock:
- mutex_unlock(&new_inode->i_mutex);
+ inode_unlock(new_inode);
out:
if (!ret) {
ret = filemap_fdatawait(inode->i_mapping);
@@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
return error;
}
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = dquot_initialize(dir);
if (!error)
error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (!error)
fsnotify_create(dir, new_dentry);
return error;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index d5da6f624142..18451e0fab81 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -54,11 +54,12 @@
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
u16 cl_cpg,
+ u16 old_bg_clusters,
int set)
{
int i;
u16 backups = 0;
- u32 cluster;
+ u32 cluster, lgd_cluster;
u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno);
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
@@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
else if (gd_blkno > lgd_blkno)
break;
+ /* check if already done backup super */
+ lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno);
+ lgd_cluster += old_bg_clusters;
+ if (lgd_cluster >= cluster)
+ continue;
+
if (set)
ocfs2_set_bit(cluster % cl_cpg,
(unsigned long *)gd->bg_bitmap);
@@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
u16 chain, num_bits, backups = 0;
u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
+ u16 old_bg_clusters;
trace_ocfs2_update_last_group_and_inode(new_clusters,
first_new_cluster);
@@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc;
/* update the group first. */
num_bits = new_clusters * cl_bpc;
le16_add_cpu(&group->bg_bits, num_bits);
@@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 1);
+ cl_cpg, old_bg_clusters, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
@@ -163,7 +172,7 @@ out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- cl_cpg, 0);
+ cl_cpg, old_bg_clusters, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
@@ -187,7 +196,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
blkno = ocfs2_backup_super_blkno(inode->i_sb, i);
cluster = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
- if (cluster > clusters)
+ if (cluster >= clusters)
break;
ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
@@ -292,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
@@ -366,7 +375,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
@@ -477,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out;
}
- mutex_lock(&main_bm_inode->i_mutex);
+ inode_lock(main_bm_inode);
ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
if (ret < 0) {
@@ -581,7 +590,7 @@ out_unlock:
ocfs2_inode_unlock(main_bm_inode, 1);
out_mutex:
- mutex_unlock(&main_bm_inode->i_mutex);
+ inode_unlock(main_bm_inode);
iput(main_bm_inode);
out:
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e78a203d44c8..1e09592148ad 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
if (si == NULL)
return;
- if (si->si_inode)
- iput(si->si_inode);
+ iput(si->si_inode);
if (si->si_bh) {
for (i = 0; i < si->si_blocks; i++) {
if (si->si_bh[i]) {
@@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
trace_ocfs2_find_slot(osb->slot_num);
status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ /*
+ * if write block failed, invalidate slot to avoid overwrite
+ * slot during dismount in case another node rightly has mounted
+ */
+ spin_lock(&osb->osb_lock);
+ ocfs2_invalidate_slot(si, osb->slot_num);
+ osb->slot_num = OCFS2_INVALID_SLOT;
+ spin_unlock(&osb->osb_lock);
+ }
bail:
return status;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 5d965e83bd43..13219ed73e1d 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = {
.attrs = ocfs2_attrs,
};
-static struct kset *ocfs2_kset;
+struct kset *ocfs2_kset;
+EXPORT_SYMBOL_GPL(ocfs2_kset);
static void ocfs2_sysfs_exit(void)
{
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 66334a30cea8..f2dce10fae54 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+extern struct kset *ocfs2_kset;
+
#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index fc6d25f6d444..2f19aeec5482 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
if (ac->ac_which != OCFS2_AC_USE_LOCAL)
ocfs2_inode_unlock(inode, 1);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
iput(inode);
ac->ac_inode = NULL;
@@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
return -EINVAL;
}
- mutex_lock(&alloc_inode->i_mutex);
+ inode_lock(alloc_inode);
status = ocfs2_inode_lock(alloc_inode, &bh, 1);
if (status < 0) {
- mutex_unlock(&alloc_inode->i_mutex);
+ inode_unlock(alloc_inode);
iput(alloc_inode);
mlog_errno(status);
@@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- mutex_lock(&inode_alloc_inode->i_mutex);
+ inode_lock(inode_alloc_inode);
status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
if (status < 0) {
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
iput(inode_alloc_inode);
mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
(u32)suballoc_slot, status);
@@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
ocfs2_inode_unlock(inode_alloc_inode, 0);
- mutex_unlock(&inode_alloc_inode->i_mutex);
+ inode_unlock(inode_alloc_inode);
iput(inode_alloc_inode);
brelse(alloc_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a9340c..d7cae3327de5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -74,17 +74,12 @@
#include "suballoc.h"
#include "buffer_head_io.h"
+#include "filecheck.h"
static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
-/* OCFS2 needs to schedule several different types of work which
- * require cluster locking, disk I/O, recovery waits, etc. Since these
- * types of work tend to be heavy we avoid using the kernel events
- * workqueue and schedule on our own. */
-struct workqueue_struct *ocfs2_wq = NULL;
-
static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
@@ -236,6 +231,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
struct ocfs2_recovery_map *rm = osb->recovery_map;
struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan;
int i, out = 0;
+ unsigned long flags;
out += snprintf(buf + out, len - out,
"%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n",
@@ -271,14 +267,14 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
cconn->cc_version.pv_minor);
}
- spin_lock(&osb->dc_task_lock);
+ spin_lock_irqsave(&osb->dc_task_lock, flags);
out += snprintf(buf + out, len - out,
"%10s => Pid: %d Count: %lu WakeSeq: %lu "
"WorkSeq: %lu\n", "DownCnvt",
(osb->dc_task ? task_pid_nr(osb->dc_task) : -1),
osb->blocked_lock_count, osb->dc_wake_sequence,
osb->dc_work_sequence);
- spin_unlock(&osb->dc_task_lock);
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags);
spin_lock(&osb->osb_lock);
out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:",
@@ -609,8 +605,8 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
/*
* We might be limited by page cache size.
*/
- if (bytes > PAGE_CACHE_SIZE) {
- bytes = PAGE_CACHE_SIZE;
+ if (bytes > PAGE_SIZE) {
+ bytes = PAGE_SIZE;
trim = 1;
/*
* Shift by 31 here so that we don't get larger than
@@ -1204,6 +1200,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
/* Start this when the mount is almost sure of being successful */
ocfs2_orphan_scan_start(osb);
+ /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
+ ocfs2_filecheck_create_sysfs(sb);
+
return status;
read_super_error:
@@ -1280,6 +1279,8 @@ static int ocfs2_parse_options(struct super_block *sb,
int status, user_stack = 0;
char *p;
u32 tmp;
+ int token, option;
+ substring_t args[MAX_OPT_ARGS];
trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
@@ -1298,9 +1299,6 @@ static int ocfs2_parse_options(struct super_block *sb,
}
while ((p = strsep(&options, ",")) != NULL) {
- int token, option;
- substring_t args[MAX_OPT_ARGS];
-
if (!*p)
continue;
@@ -1367,7 +1365,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->atime_quantum = option;
break;
case Opt_slot:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1376,7 +1373,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->slot = (s16)option;
break;
case Opt_commit:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1388,7 +1384,6 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
- option = 0;
if (match_int(&args[0], &option)) {
status = 0;
goto bail;
@@ -1612,33 +1607,25 @@ static int __init ocfs2_init(void)
if (status < 0)
goto out2;
- ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
- if (!ocfs2_wq) {
- status = -ENOMEM;
- goto out3;
- }
-
ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
if (!ocfs2_debugfs_root) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
- goto out4;
+ goto out3;
}
ocfs2_set_locking_protocol();
status = register_quota_format(&ocfs2_quota_format);
if (status < 0)
- goto out4;
+ goto out3;
status = register_filesystem(&ocfs2_fs_type);
if (!status)
return 0;
unregister_quota_format(&ocfs2_quota_format);
-out4:
- destroy_workqueue(ocfs2_wq);
- debugfs_remove(ocfs2_debugfs_root);
out3:
+ debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
out2:
exit_ocfs2_uptodate_cache();
@@ -1649,11 +1636,6 @@ out1:
static void __exit ocfs2_exit(void)
{
- if (ocfs2_wq) {
- flush_workqueue(ocfs2_wq);
- destroy_workqueue(ocfs2_wq);
- }
-
unregister_quota_format(&ocfs2_quota_format);
debugfs_remove(ocfs2_debugfs_root);
@@ -1671,6 +1653,7 @@ static void ocfs2_put_super(struct super_block *sb)
ocfs2_sync_blockdev(sb);
ocfs2_dismount_volume(sb, 0);
+ ocfs2_filecheck_remove_sysfs(sb);
}
static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1726,8 +1709,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
ocfs2_inode_unlock(inode, 0);
status = 0;
bail:
- if (inode)
- iput(inode);
+ iput(inode);
if (status)
mlog_errno(status);
@@ -1744,8 +1726,8 @@ static void ocfs2_inode_init_once(void *data)
spin_lock_init(&oi->ip_lock);
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
+ INIT_LIST_HEAD(&oi->ip_unwritten_list);
oi->ip_dir_start_lookup = 0;
- mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1771,7 +1753,7 @@ static int ocfs2_initialize_mem_caches(void)
sizeof(struct ocfs2_inode_info),
0,
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
ocfs2_inode_init_once);
ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
sizeof(struct ocfs2_dquot),
@@ -2348,6 +2330,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
cleancache_init_shared_fs(sb);
+ osb->ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
+ if (!osb->ocfs2_wq) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ }
+
bail:
return status;
}
@@ -2535,6 +2523,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
{
/* This function assumes that the caller has the main osb resource */
+ /* ocfs2_initializer_super have already created this workqueue */
+ if (osb->ocfs2_wq) {
+ flush_workqueue(osb->ocfs2_wq);
+ destroy_workqueue(osb->ocfs2_wq);
+ }
+
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index b477d0b1c7b6..b023e4f3d740 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -26,8 +26,6 @@
#ifndef OCFS2_SUPER_H
#define OCFS2_SUPER_H
-extern struct workqueue_struct *ocfs2_wq;
-
int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 66edce7ecfd7..6c2a3e3c521c 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -88,8 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = {
const struct inode_operations ocfs2_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
.setxattr = generic_setxattr,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ebfdea78659b..7d3d979f57d9 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -544,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index)
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
-
- return handler ? handler->prefix : NULL;
+ return handler ? xattr_prefix(handler) : NULL;
}
static u32 ocfs2_xattr_name_hash(struct inode *inode,
@@ -884,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
return ret;
}
-static int ocfs2_xattr_list_entry(char *buffer, size_t size,
- size_t *result, const char *prefix,
+static int ocfs2_xattr_list_entry(struct super_block *sb,
+ char *buffer, size_t size,
+ size_t *result, int type,
const char *name, int name_len)
{
char *p = buffer + *result;
- int prefix_len = strlen(prefix);
- int total_len = prefix_len + name_len + 1;
+ const char *prefix;
+ int prefix_len;
+ int total_len;
+ switch(type) {
+ case OCFS2_XATTR_INDEX_USER:
+ if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
+ case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
+ if (!(sb->s_flags & MS_POSIXACL))
+ return 0;
+ break;
+
+ case OCFS2_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+ break;
+ }
+
+ prefix = ocfs2_xattr_prefix(type);
+ if (!prefix)
+ return 0;
+ prefix_len = strlen(prefix);
+ total_len = prefix_len + name_len + 1;
*result += total_len;
/* we are just looking for how big our buffer needs to be */
@@ -914,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
{
size_t result = 0;
int i, type, ret;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
-
- if (prefix) {
- name = (const char *)header +
- le16_to_cpu(entry->xe_name_offset);
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
- ret = ocfs2_xattr_list_entry(buffer, buffer_size,
- &result, prefix, name,
- entry->xe_name_len);
- if (ret)
- return ret;
- }
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ buffer, buffer_size,
+ &result, type, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
}
return result;
@@ -2503,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
mlog_errno(ret);
goto out;
}
- mutex_lock(&xb_alloc_inode->i_mutex);
+ inode_lock(xb_alloc_inode);
ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
if (ret < 0) {
@@ -2528,7 +2549,7 @@ out_unlock:
ocfs2_inode_unlock(xb_alloc_inode, 1);
brelse(xb_alloc_bh);
out_mutex:
- mutex_unlock(&xb_alloc_inode->i_mutex);
+ inode_unlock(xb_alloc_inode);
iput(xb_alloc_inode);
out:
brelse(blk_bh);
@@ -3598,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode,
}
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
if (ret < 0) {
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
mlog_errno(ret);
goto cleanup;
}
}
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
&xbs, &ctxt, ref_meta, &credits);
@@ -4033,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
int ret = 0, type;
struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
int i, block_off, new_offset;
- const char *prefix, *name;
+ const char *name;
for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
- prefix = ocfs2_xattr_prefix(type);
- if (prefix) {
- ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
- bucket_xh(bucket),
- i,
- &block_off,
- &new_offset);
- if (ret)
- break;
+ ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+ bucket_xh(bucket),
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
- name = (const char *)bucket_block(bucket, block_off) +
- new_offset;
- ret = ocfs2_xattr_list_entry(xl->buffer,
- xl->buffer_size,
- &xl->result,
- prefix, name,
- entry->xe_name_len);
- if (ret)
- break;
- }
+ name = (const char *)bucket_block(bucket, block_off) +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(inode->i_sb,
+ xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ type, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
}
return ret;
@@ -5441,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
return ret;
}
- mutex_lock(&tl_inode->i_mutex);
+ inode_lock(tl_inode);
if (ocfs2_truncate_log_needs_flush(osb)) {
ret = __ocfs2_flush_truncate_log(osb);
@@ -5485,7 +5504,7 @@ out_commit:
out:
ocfs2_schedule_truncate_log_flush(osb, 1);
- mutex_unlock(&tl_inode->i_mutex);
+ inode_unlock(tl_inode);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
@@ -7226,39 +7245,22 @@ int ocfs2_init_security_and_acl(struct inode *dir,
leave:
return ret;
}
+
/*
* 'security' attributes support
*/
-static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, buffer, size);
}
-static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
name, value, size, flags);
}
@@ -7311,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle,
const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
.set = ocfs2_xattr_security_set,
};
@@ -7319,46 +7320,24 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
/*
* 'trusted' attributes support
*/
-static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, buffer, size);
}
-static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
name, value, size, flags);
}
const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
.set = ocfs2_xattr_trusted_set,
};
@@ -7366,45 +7345,24 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
/*
* 'user' attributes support
*/
-static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t prefix_len = XATTR_USER_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
- return 0;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_USER_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
- return total_len;
-}
-
-static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
buffer, size);
}
-static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- if (strcmp(name, "") == 0)
- return -EINVAL;
if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
return -EOPNOTSUPP;
@@ -7414,7 +7372,6 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
.set = ocfs2_xattr_user_set,
};
diff --git a/fs/open.c b/fs/open.c
index b6f1e96a7c0b..17cb6b1dab75 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
if (ret)
newattrs.ia_valid |= ret | ATTR_FORCE;
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock(dentry->d_inode);
/* Note any delegations or leases have already been broken: */
ret = notify_change(dentry, &newattrs, NULL);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(dentry->d_inode);
return ret;
}
@@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode)
if (error)
return error;
retry_deleg:
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
@@ -518,7 +518,7 @@ retry_deleg:
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
error = notify_change(path->dentry, &newattrs, &delegated_inode);
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
@@ -593,11 +593,11 @@ retry_deleg:
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_path_chown(path, uid, gid);
if (!error)
error = notify_change(path->dentry, &newattrs, &delegated_inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
@@ -887,7 +887,7 @@ EXPORT_SYMBOL(dentry_open);
static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
{
int lookup_flags = 0;
- int acc_mode;
+ int acc_mode = ACC_MODE(flags);
if (flags & (O_CREAT | __O_TMPFILE))
op->mode = (mode & S_IALLUGO) | S_IFREG;
@@ -909,7 +909,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
if (flags & __O_TMPFILE) {
if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
return -EINVAL;
- acc_mode = MAY_OPEN | ACC_MODE(flags);
if (!(acc_mode & MAY_WRITE))
return -EINVAL;
} else if (flags & O_PATH) {
@@ -919,8 +918,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
*/
flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
acc_mode = 0;
- } else {
- acc_mode = MAY_OPEN | ACC_MODE(flags);
}
op->open_flag = flags;
@@ -995,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
EXPORT_SYMBOL(filp_open);
struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
- const char *filename, int flags)
+ const char *filename, int flags, umode_t mode)
{
struct open_flags op;
- int err = build_open_flags(flags, 0, &op);
+ int err = build_open_flags(flags, mode, &op);
if (err)
return ERR_PTR(err);
- if (flags & O_CREAT)
- return ERR_PTR(-EINVAL);
return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 15e4500cda3e..b61b883c8ff8 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -443,7 +443,7 @@ static int __init init_openprom_fs(void)
sizeof(struct op_inode_info),
0,
(SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
op_inode_init_once);
if (!op_inode_cachep)
return -ENOMEM;
diff --git a/fs/orangefs/Kconfig b/fs/orangefs/Kconfig
new file mode 100644
index 000000000000..1554c02489de
--- /dev/null
+++ b/fs/orangefs/Kconfig
@@ -0,0 +1,6 @@
+config ORANGEFS_FS
+ tristate "ORANGEFS (Powered by PVFS) support"
+ select FS_POSIX_ACL
+ help
+ Orange is a parallel file system designed for use on high end
+ computing (HEC) systems.
diff --git a/fs/orangefs/Makefile b/fs/orangefs/Makefile
new file mode 100644
index 000000000000..a9d6a968fe6d
--- /dev/null
+++ b/fs/orangefs/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the ORANGEFS filesystem.
+#
+
+obj-$(CONFIG_ORANGEFS_FS) += orangefs.o
+
+orangefs-objs := acl.o file.o orangefs-cache.o orangefs-utils.o xattr.o \
+ dcache.o inode.o orangefs-sysfs.o orangefs-mod.o super.o \
+ devorangefs-req.o namei.o symlink.o dir.o orangefs-bufmap.o \
+ orangefs-debugfs.o waitqueue.o
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
new file mode 100644
index 000000000000..03f89dbb2512
--- /dev/null
+++ b/fs/orangefs/acl.c
@@ -0,0 +1,175 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/fs_struct.h>
+
+/*
+ * Fetch the POSIX ACL of the given type for an inode.
+ *
+ * The ACL is read from the matching extended attribute and converted to
+ * its in-memory form. Returns the ACL, NULL when the attribute is absent
+ * (-ENODATA) or unsupported (-ENOSYS), or an ERR_PTR() on failure.
+ */
+struct posix_acl *orangefs_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *result = NULL;
+	char *xattr_name;
+	char *buf;
+	int len;
+
+	if (type == ACL_TYPE_ACCESS) {
+		xattr_name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+	} else if (type == ACL_TYPE_DEFAULT) {
+		xattr_name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+	} else {
+		gossip_err("orangefs_get_acl: bogus value of type %d\n", type);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * A maximum-sized buffer is allocated up front instead of first
+	 * probing the attribute length (which passing NULL to
+	 * orangefs_inode_getxattr() could conceivably do), trading memory
+	 * for one fewer network round trip.
+	 */
+	buf = kmalloc(ORANGEFS_MAX_XATTR_VALUELEN, GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "inode %pU, key %s, type %d\n",
+		     get_khandle_from_ino(inode),
+		     xattr_name,
+		     type);
+
+	len = orangefs_inode_getxattr(inode,
+				      "",
+				      xattr_name,
+				      buf,
+				      ORANGEFS_MAX_XATTR_VALUELEN);
+
+	if (len > 0) {
+		/* The attribute exists: build the in-memory representation. */
+		result = posix_acl_from_xattr(&init_user_ns, buf, len);
+	} else if (len != -ENODATA && len != -ENOSYS) {
+		gossip_err("inode %pU retrieving acl's failed with error %d\n",
+			   get_khandle_from_ino(inode),
+			   len);
+		result = ERR_PTR(len);
+	}
+
+	kfree(buf);
+	return result;
+}
+
+/*
+ * Store (or, when acl is NULL, remove) the POSIX ACL of the given type.
+ *
+ * For access ACLs the inode mode is updated from the ACL first; an ACL that
+ * the mode bits represent completely is not stored as an attribute at all.
+ * Returns 0 on success or a negative errno.
+ */
+int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	struct orangefs_inode_s *oi = ORANGEFS_I(inode);
+	const char *name = NULL;
+	void *blob = NULL;
+	size_t blob_len = 0;
+	int rc = 0;
+
+	if (type == ACL_TYPE_ACCESS) {
+		name = ORANGEFS_XATTR_NAME_ACL_ACCESS;
+		if (acl) {
+			umode_t new_mode = inode->i_mode;
+
+			/*
+			 * Check whether the traditional permission bits are
+			 * enough to express this ACL.
+			 */
+			rc = posix_acl_equiv_mode(acl, &new_mode);
+			if (rc < 0) {
+				gossip_err("%s: posix_acl_equiv_mode err: %d\n",
+					   __func__,
+					   rc);
+				return rc;
+			}
+
+			if (new_mode != inode->i_mode)
+				SetModeFlag(oi);
+			inode->i_mode = new_mode;
+			mark_inode_dirty_sync(inode);
+
+			/* Zero: the mode says it all, nothing to store. */
+			if (!rc)
+				acl = NULL;
+		}
+	} else if (type == ACL_TYPE_DEFAULT) {
+		name = ORANGEFS_XATTR_NAME_ACL_DEFAULT;
+	} else {
+		gossip_err("%s: invalid type %d!\n", __func__, type);
+		return -EINVAL;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: inode %pU, key %s type %d\n",
+		     __func__, get_khandle_from_ino(inode),
+		     name,
+		     type);
+
+	if (acl) {
+		blob_len = posix_acl_xattr_size(acl->a_count);
+		blob = kmalloc(blob_len, GFP_KERNEL);
+		if (!blob)
+			return -ENOMEM;
+
+		rc = posix_acl_to_xattr(&init_user_ns, acl, blob, blob_len);
+		if (rc < 0)
+			goto free_blob;
+	}
+
+	gossip_debug(GOSSIP_ACL_DEBUG,
+		     "%s: name %s, value %p, size %zd, acl %p\n",
+		     __func__, name, blob, blob_len, acl);
+
+	/*
+	 * Write the extended attribute. With a NULL acl the value is NULL
+	 * and the size 0, which translates to a removexattr; flags of 0 keep
+	 * that removal from complaining when the attribute does not exist.
+	 */
+	rc = orangefs_inode_setxattr(inode, "", name, blob, blob_len, 0);
+
+free_blob:
+	kfree(blob);
+	if (!rc)
+		set_cached_acl(inode, type, acl);
+	return rc;
+}
+
+/*
+ * Set up the ACLs of a newly created inode from its parent directory.
+ *
+ * posix_acl_create() yields the default and access ACLs to apply together
+ * with the resulting mode; both ACLs are stored and then released.
+ * Returns 0 on success or the first error encountered.
+ */
+int orangefs_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct orangefs_inode_s *oi = ORANGEFS_I(inode);
+	struct posix_acl *dflt;
+	struct posix_acl *access;
+	umode_t new_mode = inode->i_mode;
+	int rc;
+
+	ClearModeFlag(oi);
+
+	rc = posix_acl_create(dir, &new_mode, &dflt, &access);
+	if (rc)
+		return rc;
+
+	if (dflt) {
+		rc = orangefs_set_acl(inode, dflt, ACL_TYPE_DEFAULT);
+		posix_acl_release(dflt);
+	}
+
+	if (access) {
+		/* Skip storing the access ACL once something has failed. */
+		if (!rc)
+			rc = orangefs_set_acl(inode, access, ACL_TYPE_ACCESS);
+		posix_acl_release(access);
+	}
+
+	/* A changed mode has to be pushed out with a forcible ->setattr. */
+	if (inode->i_mode != new_mode) {
+		SetModeFlag(oi);
+		inode->i_mode = new_mode;
+		orangefs_flush_inode(inode);
+	}
+
+	return rc;
+}
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
new file mode 100644
index 000000000000..5dfc4f3cfe68
--- /dev/null
+++ b/fs/orangefs/dcache.c
@@ -0,0 +1,138 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Implementation of dentry (directory cache) functions.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/*
+ * Re-run the lookup for a dentry and compare the answer with what the
+ * dentry currently claims.
+ *
+ * A positive dentry is trusted only if the lookup succeeds and returns the
+ * handle of the dentry's inode. A negative dentry is trusted only if the
+ * lookup fails with -ENOENT.
+ *
+ * Returns 1 if dentry can still be trusted, else 0 (including when the
+ * lookup op cannot be allocated).
+ */
+static int orangefs_revalidate_lookup(struct dentry *dentry)
+{
+	struct dentry *parent_dentry = dget_parent(dentry);
+	struct inode *parent_inode = parent_dentry->d_inode;
+	struct orangefs_inode_s *parent = ORANGEFS_I(parent_inode);
+	struct inode *inode = dentry->d_inode;
+	struct orangefs_kernel_op_s *new_op;
+	int ret = 0;
+	int err = 0;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
+	if (!new_op)
+		goto out_put_parent;
+
+	new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
+	new_op->upcall.req.lookup.parent_refn = parent->refn;
+	/*
+	 * strncpy() does not terminate the destination when the source fills
+	 * it, so copy one byte less and terminate explicitly.
+	 */
+	strncpy(new_op->upcall.req.lookup.d_name,
+		dentry->d_name.name,
+		ORANGEFS_NAME_MAX - 1);
+	new_op->upcall.req.lookup.d_name[ORANGEFS_NAME_MAX - 1] = '\0';
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s:%s:%d interrupt flag [%d]\n",
+		     __FILE__,
+		     __func__,
+		     __LINE__,
+		     get_interruptible_flag(parent_inode));
+
+	err = service_operation(new_op, "orangefs_lookup",
+				get_interruptible_flag(parent_inode));
+
+	if (inode) {
+		/* Positive dentry: reject if error or not the same inode. */
+		if (err) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+				     "%s:%s:%d lookup failure.\n",
+				     __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+		if (!match_handle(new_op->downcall.resp.lookup.refn.khandle,
+				  inode)) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+				     "%s:%s:%d no match.\n",
+				     __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+	} else {
+		/*
+		 * Negative dentry: reject on success or on any error other
+		 * than -ENOENT (success is err == 0, which is not -ENOENT).
+		 */
+		gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: negative dentry.\n",
+			     __func__);
+		if (err != -ENOENT) {
+			if (new_op->downcall.status != 0)
+				gossip_debug(GOSSIP_DCACHE_DEBUG,
+					     "%s:%s:%d lookup failure.\n",
+					     __FILE__, __func__, __LINE__);
+			goto out_drop;
+		}
+	}
+
+	ret = 1;
+	goto out_release_op;
+
+out_drop:
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s:%s:%d revalidate failed\n",
+		     __FILE__, __func__, __LINE__);
+out_release_op:
+	op_release(new_op);
+out_put_parent:
+	dput(parent_dentry);
+	return ret;
+}
+
+/*
+ * ->d_revalidate: decide whether a cached dentry is still usable.
+ *
+ * Returns 1 if dentry can still be trusted, 0 if it cannot, and -ECHILD
+ * when called in RCU-walk mode (LOOKUP_RCU).
+ */
+static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: called on dentry %p.\n",
+		     __func__, dentry);
+
+	/* The root handle is never looked up again. */
+	if (dentry->d_inode && is_root_handle(dentry->d_inode))
+		return 1;
+
+	/*
+	 * Success here means a positive dentry still exists, or a negative
+	 * dentry still does not exist.
+	 */
+	if (!orangefs_revalidate_lookup(dentry))
+		return 0;
+
+	/* Only positive dentries need their inode contents validated. */
+	if (dentry->d_inode) {
+		int changed = orangefs_inode_check_changed(dentry->d_inode);
+
+		if (changed < 0) {
+			gossip_debug(GOSSIP_DCACHE_DEBUG,
+				     "%s:%s:%d getattr failure.\n",
+				     __FILE__, __func__, __LINE__);
+			return 0;
+		}
+		if (!changed)
+			return 0;
+	}
+
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s: negative dentry or positive dentry and inode valid.\n",
+		     __func__);
+	return 1;
+}
+
+/* Dentry operations for orangefs: only revalidation is customised. */
+const struct dentry_operations orangefs_dentry_operations = {
+	.d_revalidate	= orangefs_d_revalidate,
+};
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
new file mode 100644
index 000000000000..db170beba797
--- /dev/null
+++ b/fs/orangefs/devorangefs-req.c
@@ -0,0 +1,943 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * Changes by Acxiom Corporation to add protocol version to kernel
+ * communication, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-dev-proto.h"
+#include "orangefs-bufmap.h"
+
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+/* this file implements the /dev/pvfs2-req device node */
+
+static int open_access_count;
+
+/*
+ * Log a banner explaining that the request device was opened while it is
+ * already in use, together with the current open_access_count.
+ */
+#define DUMP_DEVICE_ERROR()						      \
+do {									      \
+	gossip_err("*****************************************************\n");\
+	gossip_err("ORANGEFS Device Error:  You cannot open the device file "); \
+	gossip_err("\n/dev/%s more than once.  Please make sure that\nthere " \
+		   "are no ", ORANGEFS_REQDEVICE_NAME);			      \
+	gossip_err("instances of a program using this device\ncurrently "     \
+		   "running. (You must verify this!)\n");		      \
+	gossip_err("For example, you can use the lsof program as follows:\n");\
+	gossip_err("'lsof | grep %s' (run this as root)\n",		      \
+		   ORANGEFS_REQDEVICE_NAME);				      \
+	gossip_err("  open_access_count = %d\n", open_access_count);	      \
+	gossip_err("*****************************************************\n");\
+} while (0)
+
+/* Map an op tag to a bucket index: the remainder of tag / table_size. */
+static int hash_func(__u64 tag, int table_size)
+{
+	unsigned int buckets = (unsigned int)table_size;
+
+	/* do_div() divides tag in place and evaluates to the remainder. */
+	return do_div(tag, buckets);
+}
+
+/*
+ * Append an op to the in-progress hash table bucket selected by its tag.
+ * No locking is done here.
+ */
+static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
+{
+	int bucket = hash_func(op->tag, hash_table_size);
+
+	list_add_tail(&op->list, &htable_ops_in_progress[bucket]);
+}
+
+/*
+ * Look up the op carrying this tag in the in-progress hash table and unlink
+ * it. Ops that have been purged or given up on are not considered.
+ *
+ * Returns the unlinked op, or NULL if there is no such op.
+ */
+static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
+{
+	struct orangefs_kernel_op_s *cur, *tmp;
+	struct orangefs_kernel_op_s *found = NULL;
+	int bucket = hash_func(tag, hash_table_size);
+
+	spin_lock(&htable_ops_in_progress_lock);
+	list_for_each_entry_safe(cur,
+				 tmp,
+				 &htable_ops_in_progress[bucket],
+				 list) {
+		if (cur->tag != tag || op_state_purged(cur) ||
+		    op_state_given_up(cur))
+			continue;
+
+		list_del_init(&cur->list);
+		found = cur;
+		break;
+	}
+	spin_unlock(&htable_ops_in_progress_lock);
+
+	return found;
+}
+
+/*
+ * Flag every known orangefs superblock as needing a remount.
+ *
+ * Returns 1 if the superblock list was empty (nothing is mounted), and 0 if
+ * at least one superblock was flagged.
+ */
+static int mark_all_pending_mounts(void)
+{
+	struct orangefs_sb_info_s *sb;
+	int none_mounted = 1;
+
+	spin_lock(&orangefs_superblocks_lock);
+	list_for_each_entry(sb, &orangefs_superblocks, list) {
+		/* Every one of these file systems requires a remount. */
+		sb->mount_pending = 1;
+		none_mounted = 0;
+	}
+	spin_unlock(&orangefs_superblocks_lock);
+
+	return none_mounted;
+}
+
+/*
+ * Report whether the file system with the given fsid needs to be remounted.
+ *
+ * Returns -1 if no superblock with that fsid is known,
+ *          0 if it is already mounted,
+ *          1 if it needs a remount.
+ */
+static int fs_mount_pending(__s32 fsid)
+{
+	struct orangefs_sb_info_s *sb;
+	int state = -1;
+
+	spin_lock(&orangefs_superblocks_lock);
+	list_for_each_entry(sb, &orangefs_superblocks, list) {
+		if (sb->fs_id != fsid)
+			continue;
+		state = sb->mount_pending;
+		break;
+	}
+	spin_unlock(&orangefs_superblocks_lock);
+
+	return state;
+}
+
+/*
+ * open() on the request device.
+ *
+ * The device must be opened with O_NONBLOCK, and only one opener is allowed
+ * at a time. Returns 0 on success, -EINVAL for a blocking open and -EACCES
+ * when the device is already open.
+ */
+static int orangefs_devreq_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	if (file->f_flags & O_NONBLOCK) {
+		gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n");
+
+		mutex_lock(&devreq_mutex);
+		if (open_access_count == 0) {
+			open_access_count = 1;
+			rc = 0;
+		} else {
+			DUMP_DEVICE_ERROR();
+			rc = -EACCES;
+		}
+		mutex_unlock(&devreq_mutex);
+	} else {
+		gossip_err("%s: device cannot be opened in blocking mode\n",
+			   __func__);
+		rc = -EINVAL;
+	}
+
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "pvfs2-client-core: open device complete (ret = %d)\n",
+		     rc);
+	return rc;
+}
+
+/*
+ * Function for read() callers into the device.
+ *
+ * Hands the next eligible op on orangefs_request_list to the client-core.
+ * The user buffer receives, in order, the protocol version, the device
+ * magic, the op tag and the upcall structure; the op is then marked in
+ * progress and added to the in-progress hash table.
+ *
+ * Returns MAX_DEV_REQ_UPSIZE on success, -EINVAL for a blocking file or a
+ * count other than MAX_DEV_REQ_UPSIZE, -EAGAIN when no op is eligible (or
+ * the chosen op was unexpectedly already in progress or serviced), and
+ * -EFAULT when copying to the user buffer fails.
+ */
+static ssize_t orangefs_devreq_read(struct file *file,
+				    char __user *buf,
+				    size_t count, loff_t *offset)
+{
+	struct orangefs_kernel_op_s *op, *temp;
+	__s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
+	static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
+	struct orangefs_kernel_op_s *cur_op;
+	unsigned long ret;
+
+	/* We do not support blocking IO. */
+	if (!(file->f_flags & O_NONBLOCK)) {
+		gossip_err("%s: blocking read from client-core.\n",
+			   __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then
+	 * always read with that size buffer.
+	 */
+	if (count != MAX_DEV_REQ_UPSIZE) {
+		gossip_err("orangefs: client-core tried to read wrong size\n");
+		return -EINVAL;
+	}
+
+restart:
+	/*
+	 * Forget any op picked on a previous pass: if this scan finds
+	 * nothing, the stale (given up) op must not be used again.
+	 */
+	cur_op = NULL;
+
+	/* Get next op (if any) from top of list. */
+	spin_lock(&orangefs_request_list_lock);
+	list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
+		__s32 fsid;
+		/* This lock is held past the end of the loop when we break. */
+		spin_lock(&op->lock);
+		if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
+			spin_unlock(&op->lock);
+			continue;
+		}
+
+		fsid = fsid_of_op(op);
+		if (fsid != ORANGEFS_FS_ID_NULL) {
+			int pending;
+			/* Skip ops whose filesystem needs to be mounted. */
+			pending = fs_mount_pending(fsid);
+			if (pending == 1) {
+				gossip_debug(GOSSIP_DEV_DEBUG,
+				    "%s: mount pending, skipping op tag "
+				    "%llu %s\n",
+				    __func__,
+				    llu(op->tag),
+				    get_opname_string(op));
+				spin_unlock(&op->lock);
+				continue;
+			/*
+			 * Skip ops whose filesystem we don't know about unless
+			 * it is being mounted.
+			 */
+			/* XXX: is there a better way to detect this? */
+			} else if (pending == -1 &&
+				   !(op->upcall.type ==
+					ORANGEFS_VFS_OP_FS_MOUNT ||
+				     op->upcall.type ==
+					ORANGEFS_VFS_OP_GETATTR)) {
+				gossip_debug(GOSSIP_DEV_DEBUG,
+				    "orangefs: skipping op tag %llu %s\n",
+				    llu(op->tag), get_opname_string(op));
+				gossip_err(
+				    "orangefs: ERROR: fs_mount_pending %d\n",
+				    fsid);
+				spin_unlock(&op->lock);
+				continue;
+			}
+		}
+		/*
+		 * Either this op does not pertain to a filesystem, is mounting
+		 * a filesystem, or pertains to a mounted filesystem. Let it
+		 * through.
+		 */
+		cur_op = op;
+		break;
+	}
+
+	/*
+	 * At this point we either have a valid op and can continue or have not
+	 * found an op and must ask the client to try again later.
+	 */
+	if (!cur_op) {
+		spin_unlock(&orangefs_request_list_lock);
+		return -EAGAIN;
+	}
+
+	gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n",
+		     __func__,
+		     llu(cur_op->tag),
+		     get_opname_string(cur_op));
+
+	/*
+	 * Such an op should never be on the list in the first place. If so, we
+	 * will abort.
+	 */
+	if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
+		gossip_err("orangefs: ERROR: Current op already queued.\n");
+		list_del_init(&cur_op->list);
+		spin_unlock(&cur_op->lock);
+		spin_unlock(&orangefs_request_list_lock);
+		return -EAGAIN;
+	}
+
+	list_del_init(&cur_op->list);
+	spin_unlock(&orangefs_request_list_lock);
+
+	spin_unlock(&cur_op->lock);
+
+	/* Push the upcall out. */
+	ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
+	if (ret != 0)
+		goto error;
+	ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
+			   sizeof(struct orangefs_upcall_s));
+	if (ret != 0)
+		goto error;
+
+	spin_lock(&htable_ops_in_progress_lock);
+	spin_lock(&cur_op->lock);
+	/* The op was given up on while no lock was held: try the next one. */
+	if (unlikely(op_state_given_up(cur_op))) {
+		spin_unlock(&cur_op->lock);
+		spin_unlock(&htable_ops_in_progress_lock);
+		complete(&cur_op->waitq);
+		goto restart;
+	}
+
+	/*
+	 * Set the operation to be in progress and move it between lists since
+	 * it has been sent to the client.
+	 */
+	set_op_state_inprogress(cur_op);
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "%s: 1 op:%s: op_state:%d: process:%s:\n",
+		     __func__,
+		     get_opname_string(cur_op),
+		     cur_op->op_state,
+		     current->comm);
+	orangefs_devreq_add_op(cur_op);
+	spin_unlock(&cur_op->lock);
+	spin_unlock(&htable_ops_in_progress_lock);
+
+	/* The client only asks to read one size buffer. */
+	return MAX_DEV_REQ_UPSIZE;
+error:
+	/*
+	 * We were unable to copy the op data to the client. Put the op back in
+	 * list. If client has crashed, the op will be purged later when the
+	 * device is released.
+	 */
+	gossip_err("orangefs: Failed to copy data to user space\n");
+	spin_lock(&orangefs_request_list_lock);
+	spin_lock(&cur_op->lock);
+	if (likely(!op_state_given_up(cur_op))) {
+		set_op_state_waiting(cur_op);
+		gossip_debug(GOSSIP_DEV_DEBUG,
+			     "%s: 2 op:%s: op_state:%d: process:%s:\n",
+			     __func__,
+			     get_opname_string(cur_op),
+			     cur_op->op_state,
+			     current->comm);
+		list_add(&cur_op->list, &orangefs_request_list);
+		spin_unlock(&cur_op->lock);
+	} else {
+		spin_unlock(&cur_op->lock);
+		complete(&cur_op->waitq);
+	}
+	spin_unlock(&orangefs_request_list_lock);
+	return -EFAULT;
+}
+
+/*
+ * Function for writev() callers into the device.
+ *
+ * Userspace should have written:
+ *  - __u32 version
+ *  - __u32 magic
+ *  - __u64 tag
+ *  - struct orangefs_downcall_s
+ *  - trailer buffer (in the case of READDIR operations)
+ *
+ * The op matching the tag is removed from the in-progress hash table, its
+ * downcall is filled in from the written data and the op is marked
+ * serviced.
+ *
+ * Returns the number of bytes written on success, and also when no op is
+ * waiting for the tag. Returns -EFAULT for a write that is too short, a
+ * failed copy or an inconsistent size, -EPROTO for a bad version or magic,
+ * and -ENOMEM when the trailer buffer cannot be allocated. On the -EFAULT
+ * and -ENOMEM paths taken after the op has been found, an error status is
+ * stored in the downcall before the op is handed back.
+ */
+static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
+					  struct iov_iter *iter)
+{
+	ssize_t ret;
+	struct orangefs_kernel_op_s *op = NULL;
+	struct {
+		__u32 version;
+		__u32 magic;
+		__u64 tag;
+	} head;
+	int total = ret = iov_iter_count(iter);
+	int n;
+	int downcall_size = sizeof(struct orangefs_downcall_s);
+	int head_size = sizeof(head);
+
+	gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n",
+		     __func__,
+		     total,
+		     ret);
+
+	if (total < MAX_DEV_REQ_DOWNSIZE) {
+		gossip_err("%s: total:%d: must be at least:%u:\n",
+			   __func__,
+			   total,
+			   (unsigned int) MAX_DEV_REQ_DOWNSIZE);
+		return -EFAULT;
+	}
+
+	n = copy_from_iter(&head, head_size, iter);
+	if (n < head_size) {
+		gossip_err("%s: failed to copy head.\n", __func__);
+		return -EFAULT;
+	}
+
+	if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) {
+		gossip_err("%s: userspace claims version "
+			   "%u, minimum version required: %d.\n",
+			   __func__,
+			   head.version,
+			   ORANGEFS_MINIMUM_USERSPACE_VERSION);
+		return -EPROTO;
+	}
+
+	if (head.magic != ORANGEFS_DEVREQ_MAGIC) {
+		gossip_err("Error: Device magic number does not match.\n");
+		return -EPROTO;
+	}
+
+	/* remove the op from the in progress hash table */
+	op = orangefs_devreq_remove_op(head.tag);
+	if (!op) {
+		gossip_err("WARNING: No one's waiting for tag %llu\n",
+			   llu(head.tag));
+		return ret;
+	}
+
+	n = copy_from_iter(&op->downcall, downcall_size, iter);
+	if (n != downcall_size) {
+		gossip_err("%s: failed to copy downcall.\n", __func__);
+		goto Efault;
+	}
+
+	/* A failed operation carries nothing further worth validating. */
+	if (op->downcall.status)
+		goto wakeup;
+
+	/*
+	 * We've successfully peeled off the head and the downcall.
+	 * Something has gone awry if total doesn't equal the
+	 * sum of head_size, downcall_size and trailer_size.
+	 */
+	if ((head_size + downcall_size + op->downcall.trailer_size) != total) {
+		gossip_err("%s: funky write, head_size:%d"
+			   ": downcall_size:%d: trailer_size:%lld"
+			   ": total size:%d:\n",
+			   __func__,
+			   head_size,
+			   downcall_size,
+			   op->downcall.trailer_size,
+			   total);
+		goto Efault;
+	}
+
+	/* Only READDIR operations should have trailers. */
+	if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) &&
+	    (op->downcall.trailer_size != 0)) {
+		gossip_err("%s: %x operation with trailer.\n",
+			   __func__,
+			   op->downcall.type);
+		goto Efault;
+	}
+
+	/* READDIR operations should always have trailers. */
+	if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) &&
+	    (op->downcall.trailer_size == 0)) {
+		gossip_err("%s: %x operation with no trailer.\n",
+			   __func__,
+			   op->downcall.type);
+		goto Efault;
+	}
+
+	if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
+		goto wakeup;
+
+	op->downcall.trailer_buf =
+		vmalloc(op->downcall.trailer_size);
+	if (op->downcall.trailer_buf == NULL) {
+		gossip_err("%s: failed trailer vmalloc.\n",
+			   __func__);
+		goto Enomem;
+	}
+	memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
+	n = copy_from_iter(op->downcall.trailer_buf,
+			   op->downcall.trailer_size,
+			   iter);
+	if (n != op->downcall.trailer_size) {
+		gossip_err("%s: failed to copy trailer.\n", __func__);
+		vfree(op->downcall.trailer_buf);
+		/* The op lives on: do not leave a dangling pointer in it. */
+		op->downcall.trailer_buf = NULL;
+		goto Efault;
+	}
+
+wakeup:
+	/*
+	 * Return to vfs waitqueue, and back to service_operation
+	 * through wait_for_matching_downcall.
+	 */
+	spin_lock(&op->lock);
+	if (unlikely(op_is_cancel(op))) {
+		spin_unlock(&op->lock);
+		put_cancel(op);
+	} else if (unlikely(op_state_given_up(op))) {
+		spin_unlock(&op->lock);
+		complete(&op->waitq);
+	} else {
+		set_op_state_serviced(op);
+		gossip_debug(GOSSIP_DEV_DEBUG,
+			     "%s: op:%s: op_state:%d: process:%s:\n",
+			     __func__,
+			     get_opname_string(op),
+			     op->op_state,
+			     current->comm);
+		spin_unlock(&op->lock);
+	}
+	return ret;
+
+Efault:
+	/* NOTE(review): 9 presumably encodes EFAULT in the ORANGEFS error space - confirm. */
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
+	ret = -EFAULT;
+	goto wakeup;
+
+Enomem:
+	/* NOTE(review): 8 presumably encodes ENOMEM in the ORANGEFS error space - confirm. */
+	op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
+	ret = -ENOMEM;
+	goto wakeup;
+}
+
+/*
+ * release() on the request device.
+ *
+ * NOTE: gets called when the last reference to this device is dropped.
+ * open_access_count enforces that only one process has the device open at
+ * a time; it is set to -1 while teardown is running and back to 0 once it
+ * is done. devreq_mutex is held throughout so that all i/o has completed
+ * before orangefs_bufmap_finalize() and similar tricky steps run.
+ *
+ * Always returns 0.
+ */
+static int orangefs_devreq_release(struct inode *inode, struct file *file)
+{
+	int unmounted;
+
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "%s:pvfs2-client-core: exiting, closing device\n",
+		     __func__);
+
+	mutex_lock(&devreq_mutex);
+
+	orangefs_bufmap_finalize();
+	open_access_count = -1;
+
+	/* Every mounted file system needs a remount from here on. */
+	unmounted = mark_all_pending_mounts();
+	gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n",
+		     (unmounted ? "UNMOUNTED" : "MOUNTED"));
+
+	purge_waiting_ops();
+	purge_inprogress_ops();
+
+	orangefs_bufmap_run_down();
+
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "pvfs2-client-core: device close complete\n");
+	open_access_count = 0;
+
+	mutex_unlock(&devreq_mutex);
+	return 0;
+}
+
+/*
+ * Check whether client-core is alive, judged by the access count kept on
+ * the device. Returns 0 if the device is open by exactly one process,
+ * otherwise -EIO.
+ */
+int is_daemon_in_service(void)
+{
+	int status = -EIO;
+
+	mutex_lock(&devreq_mutex);
+	if (open_access_count == 1)
+		status = 0;
+	mutex_unlock(&devreq_mutex);
+
+	return status;
+}
+
+/* Same check as is_daemon_in_service(), but without taking devreq_mutex. */
+bool __is_daemon_in_service(void)
+{
+	if (open_access_count != 1)
+		return false;
+	return true;
+}
+
+/*
+ * Validate an ioctl command word.
+ *
+ * Returns 0 if the command is acceptable, -EINVAL if its type field is not
+ * ORANGEFS_DEV_MAGIC, and -ENOIOCTLCMD if its number is 0 or is not below
+ * ORANGEFS_DEV_MAXNR.
+ */
+static inline long check_ioctl_command(unsigned int command)
+{
+	/* The type field has to carry our magic. */
+	if (ORANGEFS_DEV_MAGIC != _IOC_TYPE(command)) {
+		gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n",
+			   command,
+			   _IOC_TYPE(command),
+			   ORANGEFS_DEV_MAGIC);
+		return -EINVAL;
+	}
+
+	/* The command number has to fall inside the supported range. */
+	if (_IOC_NR(command) <= 0 || _IOC_NR(command) >= ORANGEFS_DEV_MAXNR) {
+		gossip_err("Invalid ioctl command number [%d >= %d]\n",
+			   _IOC_NR(command), ORANGEFS_DEV_MAXNR);
+		return -ENOIOCTLCMD;
+	}
+
+	return 0;
+}
+
+/*
+ * Carry out one request-device ioctl.
+ *
+ * command: the ioctl code; both callers in this file run it through
+ * check_ioctl_command() first.
+ * arg: user-space address whose meaning depends on command (an __s32 for
+ * the GET_* queries, a structure or string buffer for the others).
+ *
+ * Failed user copies are reported as -EIO and codes not handled here give
+ * -ENOIOCTLCMD. ORANGEFS_DEV_MAP and ORANGEFS_DEV_REMOUNT_ALL pass through
+ * whatever orangefs_bufmap_initialize(), mutex_lock_interruptible() or
+ * orangefs_remount() return; the remaining paths return 0 on success.
+ */
+static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
+{
+ static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
+ static __s32 max_up_size = MAX_DEV_REQ_UPSIZE;
+ static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE;
+ struct ORANGEFS_dev_map_desc user_desc;
+ int ret = 0;
+ struct dev_mask_info_s mask_info = { 0 };
+ struct dev_mask2_info_s mask2_info = { 0, 0 };
+ int upstream_kmod = 1;
+ struct orangefs_sb_info_s *orangefs_sb;
+
+ /* mtmoore: add locking here */
+
+ switch (command) {
+ /* the three GET_* queries each copy one __s32 constant out to arg */
+ case ORANGEFS_DEV_GET_MAGIC:
+ return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ?
+ -EIO :
+ 0);
+ case ORANGEFS_DEV_GET_MAX_UPSIZE:
+ return ((put_user(max_up_size,
+ (__s32 __user *) arg) == -EFAULT) ?
+ -EIO :
+ 0);
+ case ORANGEFS_DEV_GET_MAX_DOWNSIZE:
+ return ((put_user(max_down_size,
+ (__s32 __user *) arg) == -EFAULT) ?
+ -EIO :
+ 0);
+ case ORANGEFS_DEV_MAP:
+ ret = copy_from_user(&user_desc,
+ (struct ORANGEFS_dev_map_desc __user *)
+ arg,
+ sizeof(struct ORANGEFS_dev_map_desc));
+ /* WTF -EIO and not -EFAULT? */
+ return ret ? -EIO : orangefs_bufmap_initialize(&user_desc);
+ case ORANGEFS_DEV_REMOUNT_ALL:
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: got ORANGEFS_DEV_REMOUNT_ALL\n",
+ __func__);
+
+ /*
+ * remount all mounted orangefs volumes to regain the lost
+ * dynamic mount tables (if any) -- NOTE: this is done
+ * without keeping the superblock list locked due to the
+ * upcall/downcall waiting. also, the request mutex is
+ * used to ensure that no operations will be serviced until
+ * all of the remounts are serviced (to avoid ops between
+ * mounts to fail)
+ */
+ ret = mutex_lock_interruptible(&request_mutex);
+ if (ret < 0)
+ return ret;
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: priority remount in progress\n",
+ __func__);
+ spin_lock(&orangefs_superblocks_lock);
+ list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
+ /*
+ * We have to drop the spinlock, so entries can be
+ * removed. They can't be freed, though, so we just
+ * keep the forward pointers and zero the back ones -
+ * that way we can get to the rest of the list.
+ */
+ if (!orangefs_sb->list.prev)
+ continue;
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: Remounting SB %p\n",
+ __func__,
+ orangefs_sb);
+
+ spin_unlock(&orangefs_superblocks_lock);
+ ret = orangefs_remount(orangefs_sb);
+ spin_lock(&orangefs_superblocks_lock);
+ /* the first failing remount ends the walk; its error is returned */
+ if (ret) {
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "SB %p remount failed\n",
+ orangefs_sb);
+ break;
+ }
+ }
+ spin_unlock(&orangefs_superblocks_lock);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: priority remount complete\n",
+ __func__);
+ mutex_unlock(&request_mutex);
+ return ret;
+
+ case ORANGEFS_DEV_UPSTREAM:
+ /* copies the constant 1 held in upstream_kmod out to arg */
+ ret = copy_to_user((void __user *)arg,
+ &upstream_kmod,
+ sizeof(upstream_kmod));
+
+ if (ret != 0)
+ return -EIO;
+ else
+ return ret;
+
+ case ORANGEFS_DEV_CLIENT_MASK:
+ ret = copy_from_user(&mask2_info,
+ (void __user *)arg,
+ sizeof(struct dev_mask2_info_s));
+
+ if (ret != 0)
+ return -EIO;
+
+ client_debug_mask.mask1 = mask2_info.mask1_value;
+ client_debug_mask.mask2 = mask2_info.mask2_value;
+
+ pr_info("%s: client debug mask has been been received "
+ ":%llx: :%llx:\n",
+ __func__,
+ (unsigned long long)client_debug_mask.mask1,
+ (unsigned long long)client_debug_mask.mask2);
+
+ /* ret is 0 here */
+ return ret;
+
+ case ORANGEFS_DEV_CLIENT_STRING:
+ ret = copy_from_user(&client_debug_array_string,
+ (void __user *)arg,
+ ORANGEFS_MAX_DEBUG_STRING_LEN);
+ /*
+ * The real client-core makes an effort to ensure
+ * that actual strings that aren't too long to fit in
+ * this buffer is what we get here. We're going to use
+ * string functions on the stuff we got, so we'll make
+ * this extra effort to try and keep from
+ * flowing out of this buffer when we use the string
+ * functions, even if somehow the stuff we end up
+ * with here is garbage.
+ */
+ client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN - 1] =
+ '\0';
+
+ if (ret != 0) {
+ pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
+ __func__);
+ return -EIO;
+ }
+
+ pr_info("%s: client debug array string has been received.\n",
+ __func__);
+
+ /* taken on the first pass only: the flag is bumped further down */
+ if (!help_string_initialized) {
+
+ /* Free the "we don't know yet" default string... */
+ kfree(debug_help_string);
+
+ /* build a proper debug help string */
+ /*
+ * NOTE(review): if this fails, debug_help_string was already
+ * freed above - confirm the helper leaves it in a safe state.
+ */
+ if (orangefs_prepare_debugfs_help_string(0)) {
+ gossip_err("%s: no debug help string \n",
+ __func__);
+ return -EIO;
+ }
+
+ /* Replace the boilerplate boot-time debug-help file. */
+ debugfs_remove(help_file_dentry);
+
+ help_file_dentry =
+ debugfs_create_file(
+ ORANGEFS_KMOD_DEBUG_HELP_FILE,
+ 0444,
+ debug_dir,
+ debug_help_string,
+ &debug_help_fops);
+
+ if (!help_file_dentry) {
+ gossip_err("%s: debugfs_create_file failed for"
+ " :%s:!\n",
+ __func__,
+ ORANGEFS_KMOD_DEBUG_HELP_FILE);
+ return -EIO;
+ }
+ }
+
+ debug_mask_to_string(&client_debug_mask, 1);
+
+ debugfs_remove(client_debug_dentry);
+
+ orangefs_client_debug_init();
+
+ help_string_initialized++;
+
+ return ret;
+
+ case ORANGEFS_DEV_DEBUG:
+ ret = copy_from_user(&mask_info,
+ (void __user *)arg,
+ sizeof(mask_info));
+
+ if (ret != 0)
+ return -EIO;
+
+ if (mask_info.mask_type == KERNEL_MASK) {
+ if ((mask_info.mask_value == 0)
+ && (kernel_mask_set_mod_init)) {
+ /*
+ * the kernel debug mask was set when the
+ * kernel module was loaded; don't override
+ * it if the client-core was started without
+ * a value for ORANGEFS_KMODMASK.
+ */
+ return 0;
+ }
+ debug_mask_to_string(&mask_info.mask_value,
+ mask_info.mask_type);
+ gossip_debug_mask = mask_info.mask_value;
+ pr_info("%s: kernel debug mask has been modified to "
+ ":%s: :%llx:\n",
+ __func__,
+ kernel_debug_string,
+ (unsigned long long)gossip_debug_mask);
+ } else if (mask_info.mask_type == CLIENT_MASK) {
+ /*
+ * only the string form is refreshed and logged in this
+ * branch; no mask variable is assigned here
+ */
+ debug_mask_to_string(&mask_info.mask_value,
+ mask_info.mask_type);
+ pr_info("%s: client debug mask has been modified to"
+ ":%s: :%llx:\n",
+ __func__,
+ client_debug_string,
+ llu(mask_info.mask_value));
+ } else {
+ gossip_lerr("Invalid mask type....\n");
+ return -EINVAL;
+ }
+
+ return ret;
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+ /* not reached: every case above returns */
+ return -ENOIOCTLCMD;
+}
+
+/*
+ * unlocked_ioctl entry point: validate the command code, then hand it to
+ * dispatch_ioctl_command(). The result passes through an int on the way out.
+ */
+static long orangefs_devreq_ioctl(struct file *file,
+				  unsigned int command, unsigned long arg)
+{
+	long rc = check_ioctl_command(command);
+
+	/* a malformed command is refused before arg is looked at */
+	if (rc >= 0)
+		rc = dispatch_ioctl_command(command, arg);
+
+	return (int)rc;
+}
+
+#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
+
+/*
+ * Compat structure for the ORANGEFS_DEV_MAP ioctl: the layout a 32-bit
+ * caller passes in. translate_dev_map26() copies it field by field into a
+ * native struct ORANGEFS_dev_map_desc.
+ */
+struct ORANGEFS_dev_map_desc32 {
+ compat_uptr_t ptr; /* user pointer as a 32-bit value; widened with compat_ptr() */
+ __s32 total_size;
+ __s32 size;
+ __s32 count;
+};
+
+/*
+ * Rebuild a 32-bit ORANGEFS_DEV_MAP argument in the native layout.
+ *
+ * args is the user address of a struct ORANGEFS_dev_map_desc32. Room for a
+ * native struct ORANGEFS_dev_map_desc is obtained from
+ * compat_alloc_user_space() and filled in one field at a time.
+ *
+ * Returns the user address of the native copy with *error set to 0, or
+ * returns 0 with *error set to -EFAULT if any of the user accesses fails.
+ */
+static unsigned long translate_dev_map26(unsigned long args, long *error)
+{
+	struct ORANGEFS_dev_map_desc32 __user *src = (void __user *)args;
+	struct ORANGEFS_dev_map_desc __user *dst =
+		compat_alloc_user_space(sizeof(*dst));
+	compat_uptr_t uptr;
+
+	*error = 0;
+
+	/*
+	 * Widen the pointer first, then carry the three 32-bit fields
+	 * across unchanged; the chain stops at the first failing access.
+	 */
+	if (get_user(uptr, &src->ptr) ||
+	    put_user(compat_ptr(uptr), &dst->ptr) ||
+	    copy_in_user(&dst->total_size, &src->total_size, sizeof(__s32)) ||
+	    copy_in_user(&dst->size, &src->size, sizeof(__s32)) ||
+	    copy_in_user(&dst->count, &src->count, sizeof(__s32))) {
+		*error = -EFAULT;
+		return 0;
+	}
+
+	return (unsigned long)dst;
+}
+
+/*
+ * ioctl handler for 32 bit user-space apps when the kernel module is
+ * compiled as a 64 bit one. ORANGEFS_DEV_MAP is the only command whose
+ * argument is translated before being dispatched.
+ */
+static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd,
+					 unsigned long args)
+{
+	unsigned long native_arg = args;
+	long rc;
+
+	/* refuse improperly constructed commands up front */
+	rc = check_ioctl_command(cmd);
+	if (rc < 0)
+		return rc;
+
+	/* every command except ORANGEFS_DEV_MAP goes through untouched */
+	if (cmd != ORANGEFS_DEV_MAP)
+		return dispatch_ioctl_command(cmd, native_arg);
+
+	/* convert the descriptor to the layout expected inside the kernel */
+	native_arg = translate_dev_map26(args, &rc);
+	if (rc < 0) {
+		gossip_err("Could not translate dev map\n");
+		return rc;
+	}
+
+	return dispatch_ioctl_command(cmd, native_arg);
+}
+
+#endif /* CONFIG_COMPAT is in .config */
+
+/* the assigned character device major number */
+static int orangefs_dev_major;
+
+/*
+ * Set up the orangefs request character device.
+ * Must be called at module load time only.
+ *
+ * Returns 0 on success, or the negative value handed back by
+ * register_chrdev() when registration fails.
+ */
+int orangefs_dev_init(void)
+{
+	int major;
+
+	/* passing 0 lets register_chrdev() pick the major number */
+	major = register_chrdev(0,
+				ORANGEFS_REQDEVICE_NAME,
+				&orangefs_devreq_file_operations);
+	orangefs_dev_major = major;
+
+	if (major < 0) {
+		gossip_debug(GOSSIP_DEV_DEBUG,
+			     "Failed to register /dev/%s (error %d)\n",
+			     ORANGEFS_REQDEVICE_NAME, major);
+		return major;
+	}
+
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "*** /dev/%s character device registered ***\n",
+		     ORANGEFS_REQDEVICE_NAME);
+	gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n",
+		     ORANGEFS_REQDEVICE_NAME, major);
+
+	return 0;
+}
+
+/* Undo orangefs_dev_init(): give the character device registration back. */
+void orangefs_dev_cleanup(void)
+{
+	const int major = orangefs_dev_major;
+
+	unregister_chrdev(major, ORANGEFS_REQDEVICE_NAME);
+	gossip_debug(GOSSIP_DEV_DEBUG,
+		     "*** /dev/%s character device unregistered ***\n",
+		     ORANGEFS_REQDEVICE_NAME);
+}
+
+/*
+ * poll() handler for the request device.
+ *
+ * Registers the caller on orangefs_request_list_waitq and reports the
+ * device readable while orangefs_request_list holds at least one entry.
+ *
+ * The return value is a poll event mask, so readability is signalled with
+ * POLLIN. POLL_IN is a siginfo si_code, not a poll event bit, and must not
+ * be used here.
+ */
+static unsigned int orangefs_devreq_poll(struct file *file,
+					 struct poll_table_struct *poll_table)
+{
+	unsigned int poll_revent_mask = 0;
+
+	poll_wait(file, &orangefs_request_list_waitq, poll_table);
+
+	if (!list_empty(&orangefs_request_list))
+		poll_revent_mask |= POLLIN;
+	return poll_revent_mask;
+}
+
+/*
+ * File operations for the request character device; this is the table
+ * orangefs_dev_init() hands to register_chrdev().
+ */
+const struct file_operations orangefs_devreq_file_operations = {
+ .owner = THIS_MODULE,
+ .read = orangefs_devreq_read,
+ .write_iter = orangefs_devreq_write_iter,
+ .open = orangefs_devreq_open,
+ .release = orangefs_devreq_release,
+ .unlocked_ioctl = orangefs_devreq_ioctl,
+
+#ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */
+ /* 32-bit callers: translates the ORANGEFS_DEV_MAP argument first */
+ .compat_ioctl = orangefs_devreq_compat_ioctl,
+#endif
+ .poll = orangefs_devreq_poll
+};
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
new file mode 100644
index 000000000000..324f0af40d7b
--- /dev/null
+++ b/fs/orangefs/dir.c
@@ -0,0 +1,396 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+/*
+ * Decode routine used by kmod to deal with the blob sent from
+ * userspace for readdirs.
+ *
+ * The blob starts with the fixed fields of struct
+ * orangefs_readdir_response_s (everything before dirent_array) and is
+ * followed by zero or more of these sub-blobs:
+ *   __u32   - represents length of the character string that follows.
+ *   string  - between 1 and ORANGEFS_NAME_MAX bytes long.
+ *   padding - (if needed) to cause the __u32 plus the string to be
+ *             eight byte aligned.
+ *   khandle - sizeof(khandle) bytes.
+ *
+ * ptr:     start of the blob.
+ * size:    number of bytes available at ptr.
+ * readdir: receives the token, the entry count and a newly allocated
+ *          dirent_array. The d_name pointers refer into the blob itself
+ *          rather than to copies, so the blob has to outlive the array.
+ *          On success the caller kfree()s dirent_array.
+ *
+ * Returns the number of bytes consumed, -EINVAL if the blob is malformed
+ * or -ENOMEM if the array cannot be allocated. No allocation is left
+ * behind in *readdir on failure.
+ */
+static long decode_dirents(char *ptr, size_t size,
+			   struct orangefs_readdir_response_s *readdir)
+{
+	int i;
+	struct orangefs_readdir_response_s *rd =
+		(struct orangefs_readdir_response_s *) ptr;
+	char *buf = ptr;
+	int khandle_size = sizeof(struct orangefs_khandle);
+	size_t offset = offsetof(struct orangefs_readdir_response_s,
+				 dirent_array);
+	/* 8 reflects eight byte alignment */
+	int smallest_blob = khandle_size + 8;
+	__u32 len;
+	int aligned_len;
+	int sizeof_u32 = sizeof(__u32);
+	long ret;
+
+	gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
+
+	/* size is = offset on empty dirs, > offset on non-empty dirs... */
+	if (size < offset) {
+		gossip_err("%s: size:%zu: offset:%zu:\n",
+			   __func__,
+			   size,
+			   offset);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Validate the entry count found in the blob (rd), not the one in
+	 * *readdir, which has not been filled in yet. Every sub-blob takes
+	 * at least smallest_blob bytes, so a count that cannot fit in what
+	 * follows the header is bogus. This covers the "empty blob with a
+	 * non-zero count" case and keeps a bad count from sizing the
+	 * allocation below.
+	 */
+	if (rd->orangefs_dirent_outcount > (size - offset) / smallest_blob) {
+		gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
+			   __func__,
+			   size,
+			   rd->orangefs_dirent_outcount);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	readdir->token = rd->token;
+	readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
+	readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
+					sizeof(*readdir->dirent_array),
+					GFP_KERNEL);
+	if (readdir->dirent_array == NULL) {
+		gossip_err("%s: kcalloc failed.\n", __func__);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* step over the fixed header to the first sub-blob */
+	buf += offset;
+	size -= offset;
+
+	for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
+		if (size < smallest_blob) {
+			gossip_err("%s: size:%zu: smallest_blob:%d:\n",
+				   __func__,
+				   size,
+				   smallest_blob);
+			ret = -EINVAL;
+			goto free;
+		}
+
+		len = *(__u32 *)buf;
+		if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
+			gossip_err("%s: len:%d:\n", __func__, len);
+			ret = -EINVAL;
+			goto free;
+		}
+
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: size:%zu: len:%d:\n",
+			     __func__,
+			     size,
+			     len);
+
+		/* the name is used in place, straight out of the blob */
+		readdir->dirent_array[i].d_name = buf + sizeof_u32;
+		readdir->dirent_array[i].d_length = len;
+
+		/*
+		 * Calculate "aligned" length of this string and its
+		 * associated __u32 descriptor.
+		 */
+		aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
+		gossip_debug(GOSSIP_DIR_DEBUG,
+			     "%s: aligned_len:%d:\n",
+			     __func__,
+			     aligned_len);
+
+		/*
+		 * The end of the blob should coincide with the end
+		 * of the last sub-blob.
+		 */
+		if (size < aligned_len + khandle_size) {
+			gossip_err("%s: ran off the end of the blob.\n",
+				   __func__);
+			ret = -EINVAL;
+			goto free;
+		}
+		size -= aligned_len + khandle_size;
+
+		buf += aligned_len;
+
+		readdir->dirent_array[i].khandle =
+			*(struct orangefs_khandle *) buf;
+		buf += khandle_size;
+	}
+	ret = buf - ptr;
+	gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
+	goto out;
+
+free:
+	kfree(readdir->dirent_array);
+	readdir->dirent_array = NULL;
+
+out:
+	return ret;
+}
+
+/*
+ * Read directory entries from an instance of an open directory.
+ *
+ * Each call issues one READDIR upcall, decodes the reply trailer with
+ * decode_dirents() and hands the entries to dir_emit(). The __u64 behind
+ * file->private_data is the token sent with the upcall; it is replaced by
+ * the token from the reply once a whole batch has been emitted. Between
+ * calls ctx->pos carries ORANGEFS_ITERATE_NEXT or ORANGEFS_READDIR_END.
+ *
+ * Returns a negative errno when the upcall or the decode fails. Otherwise
+ * ret is returned as service_operation() left it or as the last
+ * dir_emit() call overwrote it - NOTE(review): a successful pass can
+ * therefore return a non-zero, non-negative value.
+ */
+static int orangefs_readdir(struct file *file, struct dir_context *ctx)
+{
+ int ret = 0;
+ int buffer_index;
+ /*
+ * ptoken supports Orangefs' distributed directory logic, added
+ * in 2.9.2.
+ */
+ __u64 *ptoken = file->private_data;
+ __u64 pos = 0;
+ ino_t ino = 0;
+ struct dentry *dentry = file->f_path.dentry;
+ struct orangefs_kernel_op_s *new_op = NULL;
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
+ struct orangefs_readdir_response_s readdir_response;
+ void *dents_buf;
+ int i = 0;
+ int len = 0;
+ ino_t current_ino = 0;
+ char *current_entry = NULL;
+ long bytes_decoded;
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: ctx->pos:%lld, ptoken = %llu\n",
+ __func__,
+ lld(ctx->pos),
+ llu(*ptoken));
+
+ pos = (__u64) ctx->pos;
+
+ /* are we done? */
+ if (pos == ORANGEFS_READDIR_END) {
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "Skipping to termination path\n");
+ return 0;
+ }
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "orangefs_readdir called on %s (pos=%llu)\n",
+ dentry->d_name.name, llu(pos));
+
+ /* zeroed so dirent_array starts out NULL for the cleanup path */
+ memset(&readdir_response, 0, sizeof(readdir_response));
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!new_op)
+ return -ENOMEM;
+
+ /*
+ * Only the indices are shared. No memory is actually shared, but the
+ * mechanism is used.
+ */
+ new_op->uses_shared_memory = 1;
+ new_op->upcall.req.readdir.refn = orangefs_inode->refn;
+ new_op->upcall.req.readdir.max_dirent_count =
+ ORANGEFS_MAX_DIRENT_COUNT_READDIR;
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: upcall.req.readdir.refn.khandle: %pU\n",
+ __func__,
+ &new_op->upcall.req.readdir.refn.khandle);
+
+ new_op->upcall.req.readdir.token = *ptoken;
+
+get_new_buffer_index:
+ buffer_index = orangefs_readdir_index_get();
+ if (buffer_index < 0) {
+ ret = buffer_index;
+ gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
+ ret);
+ goto out_free_op;
+ }
+ new_op->upcall.req.readdir.buf_index = buffer_index;
+
+ ret = service_operation(new_op,
+ "orangefs_readdir",
+ get_interruptible_flag(dentry->d_inode));
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "Readdir downcall status is %d. ret:%d\n",
+ new_op->downcall.status,
+ ret);
+
+ /*
+ * The index is given back before the result is examined, so none of
+ * the paths below holds one.
+ */
+ orangefs_readdir_index_put(buffer_index);
+
+ if (ret == -EAGAIN && op_state_purged(new_op)) {
+ /* Client-core indices are invalid after it restarted. */
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: Getting new buffer_index for retry of readdir..\n",
+ __func__);
+ goto get_new_buffer_index;
+ }
+
+ if (ret == -EIO && op_state_purged(new_op)) {
+ gossip_err("%s: Client is down. Aborting readdir call.\n",
+ __func__);
+ goto out_free_op;
+ }
+
+ /* a non-zero downcall status becomes the return value when ret is not an error */
+ if (ret < 0 || new_op->downcall.status != 0) {
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "Readdir request failed. Status:%d\n",
+ new_op->downcall.status);
+ if (ret >= 0)
+ ret = new_op->downcall.status;
+ goto out_free_op;
+ }
+
+ dents_buf = new_op->downcall.trailer_buf;
+ if (dents_buf == NULL) {
+ gossip_err("Invalid NULL buffer in readdir response\n");
+ ret = -ENOMEM;
+ goto out_free_op;
+ }
+
+ bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
+ &readdir_response);
+ /*
+ * decode_dirents() leaves no array allocated when it fails, so only
+ * the trailer buffer needs releasing on this path.
+ */
+ if (bytes_decoded < 0) {
+ ret = bytes_decoded;
+ gossip_err("Could not decode readdir from buffer %d\n", ret);
+ goto out_vfree;
+ }
+
+ /* the decoder must have consumed the trailer exactly */
+ if (bytes_decoded != new_op->downcall.trailer_size) {
+ gossip_err("orangefs_readdir: # bytes decoded (%ld) "
+ "!= trailer size (%ld)\n",
+ bytes_decoded,
+ (long)new_op->downcall.trailer_size);
+ ret = -EINVAL;
+ goto out_destroy_handle;
+ }
+
+ /*
+ * orangefs doesn't actually store dot and dot-dot, but
+ * we need to have them represented.
+ */
+ /*
+ * NOTE(review): the dir_emit() results for "." and ".." are stored in
+ * ret but not tested before the following entries are emitted.
+ */
+ if (pos == 0) {
+ ino = get_ino_from_khandle(dentry->d_inode);
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: calling dir_emit of \".\" with pos = %llu\n",
+ __func__,
+ llu(pos));
+ ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
+ pos += 1;
+ }
+
+ if (pos == 1) {
+ ino = get_parent_ino_from_dentry(dentry);
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: calling dir_emit of \"..\" with pos = %llu\n",
+ __func__,
+ llu(pos));
+ ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
+ pos += 1;
+ }
+
+ /*
+ * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
+ * to prevent "finding" dot and dot-dot on any iteration
+ * other than the first.
+ */
+ if (ctx->pos == ORANGEFS_ITERATE_NEXT)
+ ctx->pos = 0;
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: dirent_outcount:%d:\n",
+ __func__,
+ readdir_response.orangefs_dirent_outcount);
+ /* within a batch ctx->pos doubles as the index into dirent_array */
+ for (i = ctx->pos;
+ i < readdir_response.orangefs_dirent_outcount;
+ i++) {
+ len = readdir_response.dirent_array[i].d_length;
+ current_entry = readdir_response.dirent_array[i].d_name;
+ current_ino = orangefs_khandle_to_ino(
+ &readdir_response.dirent_array[i].khandle);
+
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "calling dir_emit for %s with len %d"
+ ", ctx->pos %ld\n",
+ current_entry,
+ len,
+ (unsigned long)ctx->pos);
+ /*
+ * type is unknown. We don't return object type
+ * in the dirent_array. This leaves getdents
+ * clueless about type.
+ */
+ ret =
+ dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
+ /* stop at the first entry dir_emit() refuses; ctx->pos is left on it */
+ if (!ret)
+ break;
+ ctx->pos++;
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "%s: ctx->pos:%lld\n",
+ __func__,
+ lld(ctx->pos));
+
+ }
+
+ /*
+ * we ran all the way through the last batch, set up for
+ * getting another batch...
+ */
+ if (ret) {
+ *ptoken = readdir_response.token;
+ ctx->pos = ORANGEFS_ITERATE_NEXT;
+ }
+
+ /*
+ * Did we hit the end of the directory?
+ */
+ if (readdir_response.token == ORANGEFS_READDIR_END) {
+ gossip_debug(GOSSIP_DIR_DEBUG,
+ "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
+ ctx->pos = ORANGEFS_READDIR_END;
+ }
+
+out_destroy_handle:
+ /* kfree(NULL) is safe */
+ kfree(readdir_response.dirent_array);
+out_vfree:
+ gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
+ vfree(dents_buf);
+out_free_op:
+ op_release(new_op);
+ gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
+ return ret;
+}
+
+/*
+ * Give each open directory its own readdir token, kept behind
+ * file->private_data and initialised to ORANGEFS_READDIR_START.
+ * Returns 0, or -ENOMEM when the token cannot be allocated.
+ */
+static int orangefs_dir_open(struct inode *inode, struct file *file)
+{
+	__u64 *token = kmalloc(sizeof(__u64), GFP_KERNEL);
+
+	file->private_data = token;
+	if (!token)
+		return -ENOMEM;
+
+	*token = ORANGEFS_READDIR_START;
+	return 0;
+}
+
+/* Flush the directory inode, then free the readdir token set up at open. */
+static int orangefs_dir_release(struct inode *inode, struct file *file)
+{
+	__u64 *token = file->private_data;
+
+	orangefs_flush_inode(inode);
+	kfree(token);
+	return 0;
+}
+
+/*
+ * ORANGEFS implementation of VFS directory operations: listing goes
+ * through orangefs_readdir(), while open and release manage the per-file
+ * readdir token kept in file->private_data.
+ */
+const struct file_operations orangefs_dir_operations = {
+ .read = generic_read_dir,
+ .iterate = orangefs_readdir,
+ .open = orangefs_dir_open,
+ .release = orangefs_dir_release,
+};
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
new file mode 100644
index 000000000000..66b99210f1f9
--- /dev/null
+++ b/fs/orangefs/downcall.h
@@ -0,0 +1,133 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Definitions of downcalls used in Linux kernel module.
+ */
+
+#ifndef __DOWNCALL_H
+#define __DOWNCALL_H
+
+/*
+ * Sanitized the device-client core interaction
+ * for clean 32-64 bit usage
+ */
+struct orangefs_io_response {
+ __s64 amt_complete;
+};
+
+struct orangefs_lookup_response {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_create_response {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_symlink_response {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_getattr_response {
+ struct ORANGEFS_sys_attr_s attributes;
+ char link_target[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_mkdir_response {
+ struct orangefs_object_kref refn;
+};
+
+/*
+ * duplication of some system interface structures so that I don't have
+ * to allocate extra memory
+ *
+ * One decoded directory entry, as filled in by decode_dirents() in dir.c.
+ */
+struct orangefs_dirent {
+ char *d_name; /* points into the readdir trailer blob; not a separate copy */
+ int d_length; /* length word that preceded the name in the blob */
+ struct orangefs_khandle khandle; /* copied out of the blob */
+};
+
+struct orangefs_statfs_response {
+ __s64 block_size;
+ __s64 blocks_total;
+ __s64 blocks_avail;
+ __s64 files_total;
+ __s64 files_avail;
+};
+
+struct orangefs_fs_mount_response {
+ __s32 fs_id;
+ __s32 id;
+ struct orangefs_khandle root_khandle;
+};
+
+/* the getxattr response is the attribute value */
+struct orangefs_getxattr_response {
+ __s32 val_sz;
+ __s32 __pad1;
+ char val[ORANGEFS_MAX_XATTR_VALUELEN];
+};
+
+/* the listxattr response is an array of attribute names */
+struct orangefs_listxattr_response {
+ __s32 returned_count;
+ __s32 __pad1;
+ __u64 token;
+ char key[ORANGEFS_MAX_XATTR_LISTLEN * ORANGEFS_MAX_XATTR_NAMELEN];
+ __s32 keylen;
+ __s32 __pad2;
+ __s32 lengths[ORANGEFS_MAX_XATTR_LISTLEN];
+};
+
+struct orangefs_param_response {
+ __s64 value;
+};
+
+#define PERF_COUNT_BUF_SIZE 4096
+struct orangefs_perf_count_response {
+ char buffer[PERF_COUNT_BUF_SIZE];
+};
+
+#define FS_KEY_BUF_SIZE 4096
+struct orangefs_fs_key_response {
+ __s32 fs_keylen;
+ __s32 __pad1;
+ char fs_key[FS_KEY_BUF_SIZE];
+};
+
+/*
+ * Reply to an upcall. The header fields are common to every reply; resp
+ * holds the per-operation payload, one member per response type.
+ */
+struct orangefs_downcall_s {
+ __s32 type;
+ /* orangefs_readdir() treats any non-zero status as a failed request */
+ __s32 status;
+ /* currently trailer is used only by readdir */
+ /* byte length of trailer_buf, as handed to decode_dirents() */
+ __s64 trailer_size;
+ /* released with vfree() by orangefs_readdir() once decoded */
+ char *trailer_buf;
+
+ union {
+ struct orangefs_io_response io;
+ struct orangefs_lookup_response lookup;
+ struct orangefs_create_response create;
+ struct orangefs_symlink_response sym;
+ struct orangefs_getattr_response getattr;
+ struct orangefs_mkdir_response mkdir;
+ struct orangefs_statfs_response statfs;
+ struct orangefs_fs_mount_response fs_mount;
+ struct orangefs_getxattr_response getxattr;
+ struct orangefs_listxattr_response listxattr;
+ struct orangefs_param_response param;
+ struct orangefs_perf_count_response perf_count;
+ struct orangefs_fs_key_response fs_key;
+ } resp;
+};
+
+/*
+ * Header of the readdir trailer blob, and also the decoded form used by
+ * orangefs_readdir(). decode_dirents() overlays this on the start of the
+ * blob to read token and orangefs_dirent_outcount; the packed entries
+ * begin at offsetof(dirent_array). In the decoded copy dirent_array is a
+ * kcalloc()ed array that orangefs_readdir() frees.
+ */
+struct orangefs_readdir_response_s {
+ __u64 token; /* stored by orangefs_readdir() and sent with the next upcall */
+ __u64 directory_version;
+ __u32 __pad2;
+ __u32 orangefs_dirent_outcount; /* number of entries that follow */
+ struct orangefs_dirent *dirent_array;
+};
+
+#endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
new file mode 100644
index 000000000000..ae92795ed965
--- /dev/null
+++ b/fs/orangefs/file.c
@@ -0,0 +1,717 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS file operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+
+/*
+ * Copy to client-core's address space from the buffers specified
+ * by the iovec, up to total_size bytes.
+ * NOTE: the iovector can either contain addresses, which
+ * can further be kernel-space or user-space addresses,
+ * or it can contain pointers to struct pages.
+ *
+ * buffer_index: index of the shared buffer to fill.
+ * iter:         source of the data.
+ * total_size:   number of bytes to copy; 0 is a no-op.
+ *
+ * Returns 0 when total_size is 0, otherwise the result of
+ * orangefs_bufmap_copy_from_iovec(). A negative result is logged once.
+ */
+static int precopy_buffers(int buffer_index,
+			   struct iov_iter *iter,
+			   size_t total_size)
+{
+	int ret = 0;
+
+	/*
+	 * copy data from application/kernel by pulling it out
+	 * of the iovec.
+	 */
+	if (total_size) {
+		ret = orangefs_bufmap_copy_from_iovec(iter,
+						      buffer_index,
+						      total_size);
+		if (ret < 0)
+			gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
+				   __func__,
+				   (long)ret);
+	}
+
+	return ret;
+}
+
+/*
+ * Copy from client-core's address space to the buffers specified
+ * by the iovec, up to total_size bytes.
+ * NOTE: the target buffers named by the iovector can be kernel-space or
+ * user-space addresses, or struct page pointers.
+ *
+ * Returns 0 when total_size is 0, otherwise whatever
+ * orangefs_bufmap_copy_to_iovec() returns; a negative result is logged.
+ */
+static int postcopy_buffers(int buffer_index,
+			    struct iov_iter *iter,
+			    size_t total_size)
+{
+	int rc;
+
+	/* nothing to push out for an empty transfer */
+	if (!total_size)
+		return 0;
+
+	rc = orangefs_bufmap_copy_to_iovec(iter, buffer_index, total_size);
+	if (rc < 0)
+		gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
+			   __func__,
+			   (long)rc);
+
+	return rc;
+}
+
+/*
+ * Post and wait for the I/O upcall to finish
+ *
+ * type: ORANGEFS_IO_READ or ORANGEFS_IO_WRITE.
+ * inode: file the I/O is against.
+ * offset: file position sent with the upcall; read but not modified here.
+ * iter: data source (write) or destination (read).
+ * total_size: number of bytes requested in this upcall.
+ * readahead_size: passed through unchanged in the upcall.
+ *
+ * Returns the amt_complete value reported by the downcall on success and a
+ * negative errno on failure. An -EINTR from service_operation() is mapped
+ * to -EINTR, 0 or total_size depending on the op state (see below).
+ */
+static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
+ loff_t *offset, struct iov_iter *iter,
+ size_t total_size, loff_t readahead_size)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
+ struct orangefs_kernel_op_s *new_op = NULL;
+ /* snapshot of the iterator so a retried write can copy from the start again */
+ struct iov_iter saved = *iter;
+ int buffer_index = -1;
+ ssize_t ret;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
+ if (!new_op)
+ return -ENOMEM;
+
+ /* synchronous I/O */
+ new_op->upcall.req.io.readahead_size = readahead_size;
+ new_op->upcall.req.io.io_type = type;
+ new_op->upcall.req.io.refn = orangefs_inode->refn;
+
+populate_shared_memory:
+ /* get a shared buffer index */
+ buffer_index = orangefs_bufmap_get();
+ if (buffer_index < 0) {
+ ret = buffer_index;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s: orangefs_bufmap_get failure (%zd)\n",
+ __func__, ret);
+ goto out;
+ }
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): GET op %p -> buffer_index %d\n",
+ __func__,
+ handle,
+ new_op,
+ buffer_index);
+
+ new_op->uses_shared_memory = 1;
+ new_op->upcall.req.io.buf_index = buffer_index;
+ new_op->upcall.req.io.count = total_size;
+ new_op->upcall.req.io.offset = *offset;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): offset: %llu total_size: %zd\n",
+ __func__,
+ handle,
+ llu(*offset),
+ total_size);
+ /*
+ * Stage 1: copy the buffers into client-core's address space
+ * precopy_buffers only pertains to writes.
+ */
+ if (type == ORANGEFS_IO_WRITE) {
+ ret = precopy_buffers(buffer_index,
+ iter,
+ total_size);
+ if (ret < 0)
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Calling post_io_request with tag (%llu)\n",
+ __func__,
+ handle,
+ llu(new_op->tag));
+
+ /* Stage 2: Service the I/O operation */
+ ret = service_operation(new_op,
+ type == ORANGEFS_IO_WRITE ?
+ "file_write" :
+ "file_read",
+ get_interruptible_flag(inode));
+
+ /*
+ * If service_operation() returns -EAGAIN #and# the operation was
+ * purged from orangefs_request_list or htable_ops_in_progress, then
+ * we know that the client was restarted, causing the shared memory
+ * area to be wiped clean. To restart a write operation in this
+ * case, we must re-copy the data from the user's iovec to a NEW
+ * shared memory location. To restart a read operation, we must get
+ * a new shared memory location.
+ */
+ if (ret == -EAGAIN && op_state_purged(new_op)) {
+ orangefs_bufmap_put(buffer_index);
+ buffer_index = -1;
+ /* rewind the iterator: precopy_buffers() already advanced it */
+ if (type == ORANGEFS_IO_WRITE)
+ *iter = saved;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s:going to repopulate_shared_memory.\n",
+ __func__);
+ goto populate_shared_memory;
+ }
+
+ if (ret < 0) {
+ if (ret == -EINTR) {
+ /*
+ * We can't return EINTR if any data was written,
+ * it's not POSIX. It is minimally acceptable
+ * to give a partial write, the way NFS does.
+ *
+ * It would be optimal to return all or nothing,
+ * but if a userspace write is bigger than
+ * an IO buffer, and the interrupt occurs
+ * between buffer writes, that would not be
+ * possible.
+ */
+ switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
+ /*
+ * If the op was waiting when the interrupt
+ * occurred, then the client-core did not
+ * trigger the write.
+ */
+ case OP_VFS_STATE_WAITING:
+ if (*offset == 0)
+ ret = -EINTR;
+ else
+ ret = 0;
+ break;
+ /*
+ * If the op was in progress when the interrupt
+ * occurred, then the client-core was able to
+ * trigger the write.
+ */
+ case OP_VFS_STATE_INPROGR:
+ ret = total_size;
+ break;
+ default:
+ gossip_err("%s: unexpected op state :%d:.\n",
+ __func__,
+ new_op->op_state);
+ ret = 0;
+ break;
+ }
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s: got EINTR, state:%d: %p\n",
+ __func__,
+ new_op->op_state,
+ new_op);
+ } else {
+ gossip_err("%s: error in %s handle %pU, returning %zd\n",
+ __func__,
+ type == ORANGEFS_IO_READ ?
+ "read from" : "write to",
+ handle, ret);
+ }
+ /*
+ * NOTE(review): this return skips the bufmap put and op_release()
+ * done at out: - presumably the cancel path takes over the op and
+ * its buffer; confirm in orangefs_cancel_op_in_progress().
+ */
+ if (orangefs_cancel_op_in_progress(new_op))
+ return ret;
+
+ goto out;
+ }
+
+ /*
+ * Stage 3: Post copy buffers from client-core's address space
+ * postcopy_buffers only pertains to reads.
+ */
+ if (type == ORANGEFS_IO_READ) {
+ ret = postcopy_buffers(buffer_index,
+ iter,
+ new_op->downcall.resp.io.amt_complete);
+ if (ret < 0)
+ goto out;
+ }
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
+ __func__,
+ handle,
+ type == ORANGEFS_IO_READ ? "read" : "written",
+ (int)new_op->downcall.resp.io.amt_complete);
+
+ ret = new_op->downcall.resp.io.amt_complete;
+
+out:
+ /* buffer_index is -1 whenever no shared buffer is currently held */
+ if (buffer_index >= 0) {
+ orangefs_bufmap_put(buffer_index);
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): PUT buffer_index %d\n",
+ __func__, handle, buffer_index);
+ buffer_index = -1;
+ }
+ op_release(new_op);
+ return ret;
+}
+
+/*
+ * Common entry point for read/write/readv/writev
+ * This function will dispatch it to either the direct I/O
+ * or buffered I/O path depending on the mount options and/or
+ * augmented/extended metadata attached to the file.
+ * Note: File extended attributes override any mount options.
+ *
+ * NOTE(review): the body below only ever calls wait_for_direct_io(); no
+ * buffered path is visible here.
+ *
+ * The transfer is split into pieces no larger than
+ * orangefs_bufmap_size_query(), and *offset is advanced by each piece's
+ * result. Returns the number of bytes transferred if any piece made
+ * progress, otherwise 0 or the negative errno from wait_for_direct_io().
+ */
+static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file,
+ loff_t *offset, struct iov_iter *iter)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
+ size_t count = iov_iter_count(iter);
+ ssize_t total_count = 0;
+ ssize_t ret = -EINVAL;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
+ __func__,
+ handle,
+ (int)count);
+
+ if (type == ORANGEFS_IO_WRITE) {
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): proceeding with offset : %llu, "
+ "size %d\n",
+ __func__,
+ handle,
+ llu(*offset),
+ (int)count);
+ }
+
+ if (count == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ while (iov_iter_count(iter)) {
+ size_t each_count = iov_iter_count(iter);
+ size_t amt_complete;
+
+ /* how much to transfer in this loop iteration */
+ if (each_count > orangefs_bufmap_size_query())
+ each_count = orangefs_bufmap_size_query();
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): size of each_count(%d)\n",
+ __func__,
+ handle,
+ (int)each_count);
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): BEFORE wait_for_io: offset is %d\n",
+ __func__,
+ handle,
+ (int)*offset);
+
+ ret = wait_for_direct_io(type, inode, offset, iter,
+ each_count, 0);
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): return from wait_for_io:%d\n",
+ __func__,
+ handle,
+ (int)ret);
+
+ if (ret < 0)
+ goto out;
+
+ *offset += ret;
+ total_count += ret;
+ amt_complete = ret;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): AFTER wait_for_io: offset is %d\n",
+ __func__,
+ handle,
+ (int)*offset);
+
+ /*
+ * if we got a short I/O operations,
+ * fall out and return what we got so far
+ */
+ if (amt_complete < each_count)
+ break;
+ } /*end while */
+
+out:
+ /* bytes already moved take precedence over an error from a later piece */
+ if (total_count > 0)
+ ret = total_count;
+ /* timestamps are only touched when some data actually moved */
+ if (ret > 0) {
+ if (type == ORANGEFS_IO_READ) {
+ file_accessed(file);
+ } else {
+ SetMtimeFlag(orangefs_inode);
+ inode->i_mtime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+ }
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Value(%d) returned.\n",
+ __func__,
+ handle,
+ (int)ret);
+
+ return ret;
+}
+
+/*
+ * Read data from a specified offset in a file (referenced by inode).
+ * Data may be placed either in a user or kernel buffer.
+ *
+ * The whole request is handed to a single wait_for_direct_io() call, so
+ * the byte count remaining in @iter may not exceed
+ * orangefs_bufmap_size_query(); a larger request is rejected with -EINVAL.
+ * @readahead_size is passed through to wait_for_direct_io() unchanged.
+ *
+ * Returns the value of wait_for_direct_io(); when that value is positive,
+ * *@offset has been advanced by the same amount.
+ */
+ssize_t orangefs_inode_read(struct inode *inode,
+ struct iov_iter *iter,
+ loff_t *offset,
+ loff_t readahead_size)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ size_t count = iov_iter_count(iter);
+ size_t bufmap_size;
+ ssize_t ret = -EINVAL;
+
+ g_orangefs_stats.reads++;
+
+ /* refuse anything that cannot be served by one direct I/O call */
+ bufmap_size = orangefs_bufmap_size_query();
+ if (count > bufmap_size) {
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s: count is too large (%zd/%zd)!\n",
+ __func__, count, bufmap_size);
+ return -EINVAL;
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU) %zd@%llu\n",
+ __func__,
+ &orangefs_inode->refn.khandle,
+ count,
+ llu(*offset));
+
+ ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter,
+ count, readahead_size);
+ /* only a successful transfer moves the caller's offset */
+ if (ret > 0)
+ *offset += ret;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s(%pU): Value(%zd) returned.\n",
+ __func__,
+ &orangefs_inode->refn.khandle,
+ ret);
+
+ return ret;
+}
+
+/*
+ * ->read_iter() entry point for regular files.
+ *
+ * Works on a private copy of the kiocb position, lets do_readv_writev()
+ * advance it, and stores the result back into iocb->ki_pos regardless of
+ * the outcome. Returns whatever do_readv_writev() returned.
+ */
+static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ loff_t cursor = iocb->ki_pos;
+ ssize_t nread;
+
+ BUG_ON(iocb->private);
+
+ gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n");
+
+ g_orangefs_stats.reads++;
+
+ nread = do_readv_writev(ORANGEFS_IO_READ, iocb->ki_filp, &cursor, iter);
+ iocb->ki_pos = cursor;
+
+ return nread;
+}
+
+/*
+ * ->write_iter() entry point for regular files.
+ *
+ * Everything between the mutex_lock()/mutex_unlock() pair runs with the
+ * inode's i_mutex held. For O_APPEND opens the inode attributes are
+ * refreshed first (an -ESTALE from that refresh is reported as -EIO).
+ *
+ * Returns the value of do_readv_writev(), or the (<= 0) result of an
+ * earlier step that stopped the write. iocb->ki_pos and the global write
+ * counter are updated only when do_readv_writev() did not fail.
+ */
+static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos;
+ ssize_t rc;
+
+ BUG_ON(iocb->private);
+
+ gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
+
+ mutex_lock(&file->f_mapping->host->i_mutex);
+
+ /* Make sure generic_write_checks sees an up to date inode size. */
+ if (file->f_flags & O_APPEND) {
+ rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ if (rc == -ESTALE)
+ rc = -EIO;
+ if (rc) {
+ gossip_err("%s: orangefs_inode_getattr failed, "
+ "rc:%zd:.\n", __func__, rc);
+ goto out;
+ }
+ }
+
+ /* grow the cached size when the file position already lies past it */
+ if (file->f_pos > i_size_read(file->f_mapping->host))
+ orangefs_i_size_write(file->f_mapping->host, file->f_pos);
+
+ rc = generic_write_checks(iocb, iter);
+
+ /* note: rc == 0 also leaves here and is logged as a failure */
+ if (rc <= 0) {
+ gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
+ __func__, rc);
+ goto out;
+ }
+
+ /*
+ * if we are appending, generic_write_checks would have updated
+ * pos to the end of the file, so we will wait till now to set
+ * pos...
+ */
+ pos = *(&iocb->ki_pos);
+
+ rc = do_readv_writev(ORANGEFS_IO_WRITE,
+ file,
+ &pos,
+ iter);
+ if (rc < 0) {
+ gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
+ __func__, rc);
+ goto out;
+ }
+
+ iocb->ki_pos = pos;
+ g_orangefs_stats.writes++;
+
+out:
+
+ mutex_unlock(&file->f_mapping->host->i_mutex);
+ return rc;
+}
+
+/*
+ * Perform a miscellaneous operation on a file.
+ *
+ * Only FS_IOC_GETFLAGS and FS_IOC_SETFLAGS are handled. Both are backed by
+ * the "user.pvfs2.meta_hint" extended attribute, which is read and written
+ * as a 64-bit value. Every other command returns -ENOTTY.
+ */
+static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int ret = -ENOTTY;
+ __u64 val = 0;
+ unsigned long uval;
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_ioctl: called with cmd %d\n",
+ cmd);
+
+ /*
+ * we understand some general ioctls on files, such as the immutable
+ * and append flags
+ */
+ if (cmd == FS_IOC_GETFLAGS) {
+ val = 0;
+ ret = orangefs_inode_getxattr(file_inode(file),
+ ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ "user.pvfs2.meta_hint",
+ &val, sizeof(val));
+ /* a missing attribute (-ENODATA) is reported as "no flags set" */
+ if (ret < 0 && ret != -ENODATA)
+ return ret;
+ else if (ret == -ENODATA)
+ val = 0;
+ uval = val;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
+ (unsigned long long)uval);
+ return put_user(uval, (int __user *)arg);
+ } else if (cmd == FS_IOC_SETFLAGS) {
+ ret = 0;
+ if (get_user(uval, (int __user *)arg))
+ return -EFAULT;
+ /*
+ * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
+ * is turned on for a file. The user is not allowed to turn
+ * on this bit, but the bit is present if the user first gets
+ * the flags and then updates the flags with some new
+ * settings. So, we ignore it in the following edit. bligon.
+ */
+ if ((uval & ~ORANGEFS_MIRROR_FL) &
+ (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
+ gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
+ return -EINVAL;
+ }
+ /* the value is stored as given, including any ORANGEFS_MIRROR_FL bit */
+ val = uval;
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
+ (unsigned long long)val);
+ ret = orangefs_inode_setxattr(file_inode(file),
+ ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+ "user.pvfs2.meta_hint",
+ &val, sizeof(val), 0);
+ }
+
+ return ret;
+}
+
+/*
+ * Memory map a region of a file.
+ *
+ * Marks the mapping as sequentially read (and not randomly read) before
+ * delegating to generic_file_readonly_mmap().
+ */
+static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_file_mmap: called on %s\n",
+ (file ?
+ (char *)file->f_path.dentry->d_name.name :
+ (char *)"Unknown"));
+
+ /* sequential readahead hint on, random-read hint off */
+ vma->vm_flags = (vma->vm_flags | VM_SEQ_READ) & ~VM_RAND_READ;
+
+ /* writable maps are not supported, so only hand out read-only ones */
+ return generic_file_readonly_mmap(file, vma);
+}
+
+/* Read the nrpages field of the object that idata points to. */
+#define mapping_nrpages(idata) ((idata)->nrpages)
+
+/*
+ * Called to notify the module that there are no more references to
+ * this file (i.e. no processes have it open).
+ *
+ * Flushes the inode with orangefs_flush_inode() and, when the inode's
+ * i_data still reports pages, truncates the whole mapping.
+ * Always returns 0.
+ *
+ * \note Not called when each file is closed.
+ */
+static int orangefs_file_release(struct inode *inode, struct file *file)
+{
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_file_release: called on %s\n",
+ file->f_path.dentry->d_name.name);
+
+ orangefs_flush_inode(inode);
+
+ /*
+ * remove all associated inode pages from the page cache and mmap
+ * readahead cache (if any); this forces an expensive refresh of
+ * data for the next caller of mmap (or 'get_block' accesses)
+ */
+ if (file->f_path.dentry->d_inode &&
+ file->f_path.dentry->d_inode->i_mapping &&
+ mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
+ truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
+ 0);
+ return 0;
+}
+
+/*
+ * Push all data for a specific file onto permanent storage.
+ *
+ * Writes back and waits on the page-cache range [start, end] and then
+ * sends an ORANGEFS_VFS_OP_FSYNC upcall for the inode. A failure of the
+ * local writeback step is returned right away instead of being discarded,
+ * so the caller is never told "synced" after that step reported an error.
+ * @datasync is not used.
+ *
+ * Returns the error from filemap_write_and_wait_range(), -ENOMEM when no
+ * op could be allocated, or otherwise the result of service_operation().
+ */
+static int orangefs_fsync(struct file *file,
+ loff_t start,
+ loff_t end,
+ int datasync)
+{
+ int ret;
+ struct orangefs_inode_s *orangefs_inode =
+ ORANGEFS_I(file->f_path.dentry->d_inode);
+ struct orangefs_kernel_op_s *new_op = NULL;
+
+ /* required call; its result must not be silently dropped */
+ ret = filemap_write_and_wait_range(file->f_mapping, start, end);
+ if (ret < 0)
+ return ret;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.fsync.refn = orangefs_inode->refn;
+
+ ret = service_operation(new_op,
+ "orangefs_fsync",
+ get_interruptible_flag(file->f_path.dentry->d_inode));
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_fsync got return value of %d\n",
+ ret);
+
+ op_release(new_op);
+
+ /* the inode is flushed whatever the upcall returned */
+ orangefs_flush_inode(file->f_path.dentry->d_inode);
+ return ret;
+}
+
+/*
+ * Change the file pointer position for an instance of an open file.
+ *
+ * For SEEK_END the inode attributes are refreshed first so that the seek
+ * is computed against a current file size; a failed refresh is returned
+ * to the caller (with -ESTALE reported as -EIO). The seek itself is done
+ * by generic_file_llseek().
+ *
+ * \note If .llseek is overridden, we must acquire lock as described in
+ * Documentation/filesystems/Locking.
+ *
+ * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
+ * require many changes to the FS
+ */
+static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
+{
+ int ret = -EINVAL;
+ struct inode *inode = file_inode(file);
+
+ if (origin == SEEK_END) {
+ /*
+ * revalidate the inode's file size.
+ * NOTE: We are only interested in file size here,
+ * so we set mask accordingly.
+ */
+ ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+ if (ret == -ESTALE)
+ ret = -EIO;
+ if (ret) {
+ /*
+ * NOTE(review): the message mentions "make bad inode",
+ * but this path only returns the error.
+ */
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "%s:%s:%d calling make bad inode\n",
+ __FILE__,
+ __func__,
+ __LINE__);
+ return ret;
+ }
+ }
+
+ gossip_debug(GOSSIP_FILE_DEBUG,
+ "orangefs_file_llseek: offset is %ld | origin is %d"
+ " | inode size is %lu\n",
+ (long)offset,
+ origin,
+ (unsigned long)i_size_read(inode));
+
+ return generic_file_llseek(file, offset, origin);
+}
+
+/*
+ * Support local locks (locks that only this kernel knows about)
+ * if Orangefs was mounted -o local_lock.
+ *
+ * Without ORANGEFS_OPT_LOCAL_LOCK in the superblock flags every request
+ * fails with -EINVAL. F_GETLK is answered through posix_test_lock() and
+ * returns 0; every other command is forwarded to posix_lock_file().
+ */
+static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
+{
+ if (!(ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK))
+ return -EINVAL;
+
+ if (cmd != F_GETLK)
+ return posix_lock_file(filp, fl, NULL);
+
+ posix_test_lock(filp, fl);
+ return 0;
+}
+
+/**
+ * ORANGEFS implementation of VFS file operations.
+ *
+ * Installed as i_fop for regular files by orangefs_init_iops(). Only
+ * .open uses a generic helper; the other hooks are the orangefs_*
+ * functions of this file.
+ */
+const struct file_operations orangefs_file_operations = {
+ .llseek = orangefs_file_llseek,
+ .read_iter = orangefs_file_read_iter,
+ .write_iter = orangefs_file_write_iter,
+ .lock = orangefs_lock,
+ .unlocked_ioctl = orangefs_ioctl,
+ .mmap = orangefs_file_mmap,
+ .open = generic_file_open,
+ .release = orangefs_file_release,
+ .fsync = orangefs_fsync,
+};
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
new file mode 100644
index 000000000000..85640e955cde
--- /dev/null
+++ b/fs/orangefs/inode.c
@@ -0,0 +1,461 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS inode operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+/*
+ * Fill one page-cache page with file data and unlock it.
+ *
+ * The page is wrapped in a one-segment bvec iterator of PAGE_SIZE bytes
+ * and read with orangefs_inode_read() at offset (page->index << PAGE_SHIFT).
+ * Pages whose index is not below (i_size / PAGE_SIZE) + 1 are not read at
+ * all. Whatever part of the iterator is still unfilled afterwards is
+ * zeroed.
+ *
+ * Returns 0 and marks the page up to date, or returns the negative value
+ * from orangefs_inode_read() and sets the page error flag. The page is
+ * unlocked on both paths.
+ */
+static int read_one_page(struct page *page)
+{
+ int ret;
+ int max_block;
+ ssize_t bytes_read = 0;
+ struct inode *inode = page->mapping->host;
+ const __u32 blocksize = PAGE_SIZE; /* inode->i_blksize */
+ const __u32 blockbits = PAGE_SHIFT; /* inode->i_blkbits */
+ struct iov_iter to;
+ struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
+
+ iov_iter_bvec(&to, ITER_BVEC | READ, &bv, 1, PAGE_SIZE);
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_readpage called with page %p\n",
+ page);
+
+ max_block = ((inode->i_size / blocksize) + 1);
+
+ if (page->index < max_block) {
+ loff_t blockptr_offset = (((loff_t) page->index) << blockbits);
+
+ /* the current i_size is handed over as the readahead_size argument */
+ bytes_read = orangefs_inode_read(inode,
+ &to,
+ &blockptr_offset,
+ inode->i_size);
+ }
+ /* this will only zero remaining unread portions of the page data */
+ iov_iter_zero(~0U, &to);
+ /* takes care of potential aliasing */
+ flush_dcache_page(page);
+ if (bytes_read < 0) {
+ ret = bytes_read;
+ SetPageError(page);
+ } else {
+ SetPageUptodate(page);
+ if (PageError(page))
+ ClearPageError(page);
+ ret = 0;
+ }
+ /* unlock the page after the ->readpage() routine completes */
+ unlock_page(page);
+ return ret;
+}
+
+/*
+ * ->readpage() hook: thin wrapper around read_one_page(); @file is unused.
+ */
+static int orangefs_readpage(struct file *file, struct page *page)
+{
+ return read_one_page(page);
+}
+
+/*
+ * ->readpages() hook: consume @nr_pages pages from the tail of @pages.
+ *
+ * Each page is unlinked from the list and offered to add_to_page_cache()
+ * at its own index. When that call returns 0 the page is filled with
+ * read_one_page(); otherwise the page reference is dropped with
+ * put_page(). Per-page read results are only logged: the function always
+ * returns 0 and expects the list to be empty at the end.
+ */
+static int orangefs_readpages(struct file *file,
+ struct address_space *mapping,
+ struct list_head *pages,
+ unsigned nr_pages)
+{
+ int page_idx;
+ int ret;
+
+ gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n");
+
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page;
+
+ page = list_entry(pages->prev, struct page, lru);
+ list_del(&page->lru);
+ if (!add_to_page_cache(page,
+ mapping,
+ page->index,
+ GFP_KERNEL)) {
+ ret = read_one_page(page);
+ /*
+ * NOTE(review): this message is emitted in the branch
+ * where add_to_page_cache() returned 0, so the
+ * "failure adding page" wording looks misleading.
+ */
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "failure adding page to cache, read_one_page returned: %d\n",
+ ret);
+ } else {
+ put_page(page);
+ }
+ }
+ BUG_ON(!list_empty(pages));
+ return 0;
+}
+
+/*
+ * ->invalidatepage() hook: drop the "up to date" and "mapped to disk"
+ * state of @page. @offset is only logged and @length is unused.
+ */
+static void orangefs_invalidatepage(struct page *page,
+ unsigned int offset,
+ unsigned int length)
+{
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_invalidatepage called on page %p "
+ "(offset is %u)\n",
+ page,
+ offset);
+
+ ClearPageUptodate(page);
+ ClearPageMappedToDisk(page);
+}
+
+/*
+ * ->releasepage() hook: logs the call and always returns 0; the gfp
+ * argument is unused.
+ */
+static int orangefs_releasepage(struct page *page, gfp_t foo)
+{
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_releasepage called on page %p\n",
+ page);
+ return 0;
+}
+
+/*
+ * Having a direct_IO entry point in the address_space_operations
+ * struct causes the kernel to allow us to use O_DIRECT on
+ * open. Nothing will ever call this thing, but in the future we
+ * will need to be able to use O_DIRECT on open in order to support
+ * AIO. Modeled after NFS, they do this too.
+ */
+/*
+ * static ssize_t orangefs_direct_IO(int rw,
+ * struct kiocb *iocb,
+ * struct iov_iter *iter,
+ * loff_t offset)
+ *{
+ * gossip_debug(GOSSIP_INODE_DEBUG,
+ * "orangefs_direct_IO: %s\n",
+ * iocb->ki_filp->f_path.dentry->d_name.name);
+ *
+ * return -EINVAL;
+ *}
+ */
+
+/*
+ * Backing-device description used by ORANGEFS: zero ra_pages and the
+ * BDI_CAP_NO_ACCT_DIRTY / BDI_CAP_NO_WRITEBACK capability flags.
+ */
+struct backing_dev_info orangefs_backing_dev_info = {
+ .name = "orangefs",
+ .ra_pages = 0,
+ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+
+/**
+ * ORANGEFS implementation of address space operations.
+ *
+ * Installed on every inode's mapping by orangefs_init_iops(). Only the
+ * read side, invalidate and release hooks are provided; .direct_IO is
+ * left commented out.
+ */
+const struct address_space_operations orangefs_address_operations = {
+ .readpage = orangefs_readpage,
+ .readpages = orangefs_readpages,
+ .invalidatepage = orangefs_invalidatepage,
+ .releasepage = orangefs_releasepage,
+/* .direct_IO = orangefs_direct_IO */
+};
+
+/*
+ * Apply the size change requested in @iattr to @inode.
+ *
+ * Steps: refresh the attributes (so the old size is current), set the new
+ * size locally with truncate_setsize(), then send an
+ * ORANGEFS_VFS_OP_TRUNCATE upcall. If the size differs from the refreshed
+ * original afterwards, ATTR_CTIME | ATTR_MTIME are added to
+ * iattr->ia_valid for the caller's subsequent attribute update.
+ *
+ * Returns 0 on success or a negative error: -EIO in place of -ESTALE from
+ * the refresh, -ENOMEM when no op can be allocated, or the result of
+ * service_operation().
+ */
+static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ loff_t orig_size;
+ int ret = -EINVAL;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "%s: %pU: Handle is %pU | fs_id %d | size is %llu\n",
+ __func__,
+ get_khandle_from_ino(inode),
+ &orangefs_inode->refn.khandle,
+ orangefs_inode->refn.fs_id,
+ iattr->ia_size);
+
+ /* Ensure that we have a up to date size, so we know if it changed. */
+ ret = orangefs_inode_getattr(inode, 0, 1);
+ if (ret == -ESTALE)
+ ret = -EIO;
+ if (ret) {
+ gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n",
+ __func__, ret);
+ return ret;
+ }
+ orig_size = i_size_read(inode);
+
+ /*
+ * note: the local size is changed before the op is allocated, so the
+ * -ENOMEM return below leaves the new local size in place
+ */
+ truncate_setsize(inode, iattr->ia_size);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.truncate.refn = orangefs_inode->refn;
+ new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
+
+ ret = service_operation(new_op, __func__,
+ get_interruptible_flag(inode));
+
+ /*
+ * the truncate has no downcall members to retrieve, but
+ * the status value tells us if it went through ok or not
+ */
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs: orangefs_truncate got return value of %d\n",
+ ret);
+
+ op_release(new_op);
+
+ if (ret != 0)
+ return ret;
+
+ /* a real size change also counts as a ctime/mtime change */
+ if (orig_size != i_size_read(inode))
+ iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+
+ return ret;
+}
+
+/*
+ * Change attributes of an object referenced by dentry.
+ *
+ * After inode_change_ok() accepts the request, a size change (ATTR_SIZE
+ * with a size different from the cached one) is handled by
+ * orangefs_setattr_size(). The attributes are then copied into the
+ * in-core inode and sent on with orangefs_inode_setattr(); when the mode
+ * was part of a successful update, posix_acl_chmod() is called as well.
+ *
+ * Returns 0 on success or the first negative error encountered.
+ */
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ int ret = -EINVAL;
+ struct inode *inode = dentry->d_inode;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_setattr: called on %s\n",
+ dentry->d_name.name);
+
+ ret = inode_change_ok(inode, iattr);
+ if (ret)
+ goto out;
+
+ /* only a size that differs from the cached one is treated as a resize */
+ if ((iattr->ia_valid & ATTR_SIZE) &&
+ iattr->ia_size != i_size_read(inode)) {
+ ret = orangefs_setattr_size(inode, iattr);
+ if (ret)
+ goto out;
+ }
+
+ setattr_copy(inode, iattr);
+ mark_inode_dirty(inode);
+
+ ret = orangefs_inode_setattr(inode, iattr);
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_setattr: inode_setattr returned %d\n",
+ ret);
+
+ if (!ret && (iattr->ia_valid & ATTR_MODE))
+ /* change mod on a file that has ACLs */
+ ret = posix_acl_chmod(inode, inode->i_mode);
+
+out:
+ gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret);
+ return ret;
+}
+
+/*
+ * Obtain attributes of an object given a dentry.
+ *
+ * The inode attributes are refreshed first; on success @kstat is filled
+ * by generic_fillattr() and its blksize is replaced by the block size
+ * recorded in the ORANGEFS inode. Returns 0 or the refresh error.
+ */
+int orangefs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry,
+ struct kstat *kstat)
+{
+ struct inode *inode = dentry->d_inode;
+ int err;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "orangefs_getattr: called on %s\n",
+ dentry->d_name.name);
+
+ err = orangefs_inode_getattr(inode, 0, 1);
+ if (err)
+ return err;
+
+ generic_fillattr(inode, kstat);
+
+ /* override block size reported to stat */
+ kstat->blksize = ORANGEFS_I(inode)->blksize;
+ return 0;
+}
+
+/*
+ * ->permission() hook.
+ *
+ * Requests carrying MAY_NOT_BLOCK are refused with -ECHILD before any
+ * refresh is attempted. Otherwise the inode attributes are refreshed and
+ * the decision is left to generic_permission().
+ */
+int orangefs_permission(struct inode *inode, int mask)
+{
+ int err;
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
+
+ /* Make sure the permission (and other common attrs) are up to date. */
+ err = orangefs_inode_getattr(inode, 0, 0);
+
+ return err < 0 ? err : generic_permission(inode, mask);
+}
+
+/*
+ * ORANGEFS implementation of VFS inode operations for files.
+ * Installed as i_op for regular files by orangefs_init_iops().
+ */
+struct inode_operations orangefs_file_inode_operations = {
+ .get_acl = orangefs_get_acl,
+ .set_acl = orangefs_set_acl,
+ .setattr = orangefs_setattr,
+ .getattr = orangefs_getattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = orangefs_listxattr,
+ .removexattr = generic_removexattr,
+ .permission = orangefs_permission,
+};
+
+/*
+ * Install the operation tables that match the inode's file type.
+ *
+ * The address-space operations are set for every inode. Regular files,
+ * symlinks and directories then get their own i_op (and, except for
+ * symlinks, i_fop). Any other type is rejected.
+ *
+ * Returns 0 on success or -EINVAL for an unsupported file type.
+ */
+static int orangefs_init_iops(struct inode *inode)
+{
+ inode->i_mapping->a_ops = &orangefs_address_operations;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &orangefs_file_inode_operations;
+ inode->i_fop = &orangefs_file_operations;
+ inode->i_blkbits = PAGE_SHIFT;
+ break;
+ case S_IFLNK:
+ inode->i_op = &orangefs_symlink_inode_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &orangefs_dir_inode_operations;
+ inode->i_fop = &orangefs_dir_operations;
+ break;
+ default:
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "%s: unsupported mode\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Turn an ORANGEFS object reference (fsid, handle) into the ino_t value
+ * that serves as the hash index under which the handle is looked up in
+ * the VFS inode hash table. A NULL reference hashes to 0.
+ */
+static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
+{
+ return ref ? orangefs_khandle_to_ino(&ref->khandle) : 0;
+}
+
+/*
+ * Set-up callback for iget5_locked(): copy the object reference passed in
+ * @data (fs_id and khandle) into the ORANGEFS part of @inode.
+ * Always returns 0.
+ */
+static int orangefs_set_inode(struct inode *inode, void *data)
+{
+ struct orangefs_object_kref *src = data;
+ struct orangefs_inode_s *dst = ORANGEFS_I(inode);
+
+ dst->refn.fs_id = src->fs_id;
+ dst->refn.khandle = src->khandle;
+ return 0;
+}
+
+/*
+ * Match callback for the inode hash: returns non-zero only when @inode
+ * carries the same khandle and the same fs_id as the reference in @data.
+ */
+static int orangefs_test_inode(struct inode *inode, void *data)
+{
+ struct orangefs_object_kref *wanted = data;
+ struct orangefs_inode_s *have = ORANGEFS_I(inode);
+
+ /* a non-zero compare result means the handles differ */
+ if (ORANGEFS_khandle_cmp(&have->refn.khandle, &wanted->khandle))
+ return 0;
+
+ return have->refn.fs_id == wanted->fs_id;
+}
+
+/*
+ * Front-end to lookup the inode-cache maintained by the VFS using the ORANGEFS
+ * file handle.
+ *
+ * @sb: the file system super block instance.
+ * @ref: The ORANGEFS object for which we are trying to locate an inode structure.
+ *
+ * Returns the inode found or created by iget5_locked(). An inode that is
+ * not new is returned as-is; a new one first has its attributes fetched,
+ * its i_ino and operation tables set, and is then unlocked. The result is
+ * NULL when iget5_locked() returns NULL, and an ERR_PTR() when fetching
+ * the attributes of a new inode fails.
+ */
+struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref)
+{
+ struct inode *inode = NULL;
+ unsigned long hash;
+ int error;
+
+ hash = orangefs_handle_hash(ref);
+ inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref);
+ /* nothing to initialise for a missing or already set-up inode */
+ if (!inode || !(inode->i_state & I_NEW))
+ return inode;
+
+ error = orangefs_inode_getattr(inode, 1, 0);
+ if (error) {
+ iget_failed(inode);
+ return ERR_PTR(error);
+ }
+
+ inode->i_ino = hash; /* needed for stat etc */
+ /* note: the return value of orangefs_init_iops() is not checked here */
+ orangefs_init_iops(inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "iget handle %pU, fsid %d hash %ld i_ino %lu\n",
+ &ref->khandle,
+ ref->fs_id,
+ hash,
+ inode->i_ino);
+
+ return inode;
+}
+
+/*
+ * Allocate an inode for a newly created file and insert it into the inode hash.
+ *
+ * @sb: super block the inode belongs to
+ * @dir: parent directory, handed to orangefs_init_acl()
+ * @mode: complete i_mode (type and permission bits) for the new inode
+ * @dev: value stored in i_rdev
+ * @ref: ORANGEFS object reference (fs_id, khandle) of the new object
+ *
+ * Returns the inode after it has been inserted with insert_inode_locked4(),
+ * or an ERR_PTR() on failure. NULL is never returned: the callers test the
+ * result with IS_ERR() only, so an allocation failure is reported as
+ * ERR_PTR(-ENOMEM) instead of a NULL that would slip through that test.
+ */
+struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
+ int mode, dev_t dev, struct orangefs_object_kref *ref)
+{
+ unsigned long hash = orangefs_handle_hash(ref);
+ struct inode *inode;
+ int error;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "%s:(sb is %p | MAJOR(dev)=%u | MINOR(dev)=%u mode=%o)\n",
+ __func__,
+ sb,
+ MAJOR(dev),
+ MINOR(dev),
+ mode);
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ orangefs_set_inode(inode, ref);
+ inode->i_ino = hash; /* needed for stat etc */
+
+ error = orangefs_inode_getattr(inode, 1, 0);
+ if (error)
+ goto out_iput;
+
+ orangefs_init_iops(inode);
+
+ /* the values below replace what the attribute fetch filled in */
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_size = PAGE_SIZE;
+ inode->i_rdev = dev;
+
+ error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
+ if (error < 0)
+ goto out_iput;
+
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "Initializing ACL's for inode %pU\n",
+ get_khandle_from_ino(inode));
+ orangefs_init_acl(inode, dir);
+ return inode;
+
+out_iput:
+ iput(inode);
+ return ERR_PTR(error);
+}
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
new file mode 100644
index 000000000000..5a60c508af4e
--- /dev/null
+++ b/fs/orangefs/namei.c
@@ -0,0 +1,462 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS namei operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/*
+ * Get a newly allocated inode to go with a negative dentry.
+ *
+ * Sends an ORANGEFS_VFS_OP_CREATE upcall for @dentry's name under @dir,
+ * builds an inode for the returned object reference, attaches it to the
+ * dentry and marks the parent's mtime/ctime as changed. @exclusive is
+ * not used.
+ *
+ * Returns 0 on success, -ENOMEM when no op can be allocated, or the error
+ * from service_operation() / orangefs_new_inode().
+ */
+static int orangefs_create(struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ bool exclusive)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s: %s\n",
+ __func__,
+ dentry->d_name.name);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_CREATE);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.create.parent_refn = parent->refn;
+
+ fill_default_sys_attrs(new_op->upcall.req.create.attributes,
+ ORANGEFS_TYPE_METAFILE, mode);
+
+ strncpy(new_op->upcall.req.create.d_name,
+ dentry->d_name.name, ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: %s: handle:%pU: fsid:%d: new_op:%p: ret:%d:\n",
+ __func__,
+ dentry->d_name.name,
+ &new_op->downcall.resp.create.refn.khandle,
+ new_op->downcall.resp.create.refn.fs_id,
+ new_op,
+ ret);
+
+ if (ret < 0)
+ goto out;
+
+ /* the downcall carries the reference of the object just created */
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0,
+ &new_op->downcall.resp.create.refn);
+ if (IS_ERR(inode)) {
+ gossip_err("%s: Failed to allocate inode for file :%s:\n",
+ __func__,
+ dentry->d_name.name);
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: Assigned inode :%pU: for file :%s:\n",
+ __func__,
+ get_khandle_from_ino(inode),
+ dentry->d_name.name);
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: dentry instantiated for %s\n",
+ __func__,
+ dentry->d_name.name);
+
+ /* the parent directory changed: flag and stamp its mtime/ctime */
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+ ret = 0;
+out:
+ op_release(new_op);
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: %s: returning %d\n",
+ __func__,
+ dentry->d_name.name,
+ ret);
+ return ret;
+}
+
+/*
+ * Attempt to resolve an object name (dentry->d_name), parent handle, and
+ * fsid into a handle for the object.
+ *
+ * Names longer than ORANGEFS_NAME_MAX - 1 are refused with -ENAMETOOLONG
+ * before any upcall is made. An -ENOENT answer adds a negative dentry and
+ * returns NULL; any other upcall error is returned as an ERR_PTR(). On
+ * success the inode obtained from orangefs_iget() is spliced into the
+ * dcache and the result of d_splice_alias() is returned.
+ */
+static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ struct dentry *res;
+ int ret = -EINVAL;
+
+ /*
+ * in theory we could skip a lookup here (if the intent is to
+ * create) in order to avoid a potentially failed lookup, but
+ * leaving it in can skip a valid lookup and try to create a file
+ * that already exists (e.g. the vfs already handles checking for
+ * -EEXIST on O_EXCL opens, which is broken if we skip this lookup
+ * in the create path)
+ */
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s called on %s\n",
+ __func__, dentry->d_name.name);
+
+ if (dentry->d_name.len > (ORANGEFS_NAME_MAX - 1))
+ return ERR_PTR(-ENAMETOOLONG);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP);
+ if (!new_op)
+ return ERR_PTR(-ENOMEM);
+
+ new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW;
+
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d using parent %pU\n",
+ __FILE__,
+ __func__,
+ __LINE__,
+ &parent->refn.khandle);
+ new_op->upcall.req.lookup.parent_refn = parent->refn;
+
+ strncpy(new_op->upcall.req.lookup.d_name, dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: doing lookup on %s under %pU,%d\n",
+ __func__,
+ new_op->upcall.req.lookup.d_name,
+ &new_op->upcall.req.lookup.parent_refn.khandle,
+ new_op->upcall.req.lookup.parent_refn.fs_id);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Lookup Got %pU, fsid %d (ret=%d)\n",
+ &new_op->downcall.resp.lookup.refn.khandle,
+ new_op->downcall.resp.lookup.refn.fs_id,
+ ret);
+
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ /*
+ * if no inode was found, add a negative dentry to
+ * dcache anyway; if we don't, we don't hold expected
+ * lookup semantics and we most noticeably break
+ * during directory renames.
+ *
+ * however, if the operation failed or exited, do not
+ * add the dentry (e.g. in the case that a touch is
+ * issued on a file that already exists that was
+ * interrupted during this lookup -- no need to add
+ * another negative dentry for an existing file)
+ */
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "orangefs_lookup: Adding *negative* dentry "
+ "%p for %s\n",
+ dentry,
+ dentry->d_name.name);
+
+ d_add(dentry, NULL);
+ res = NULL;
+ goto out;
+ }
+
+ /* must be a non-recoverable error */
+ res = ERR_PTR(ret);
+ goto out;
+ }
+
+ inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+ if (IS_ERR(inode)) {
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "error %ld from iget\n", PTR_ERR(inode));
+ res = ERR_CAST(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s:%s:%d "
+ "Found good inode [%lu] with count [%d]\n",
+ __FILE__,
+ __func__,
+ __LINE__,
+ inode->i_ino,
+ (int)atomic_read(&inode->i_count));
+
+ /* update dentry/inode pair into dcache */
+ res = d_splice_alias(inode, dentry);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Lookup success (inode ct = %d)\n",
+ (int)atomic_read(&inode->i_count));
+out:
+ /* every path past op_alloc() releases the op here */
+ op_release(new_op);
+ return res;
+}
+
+/*
+ * Remove the name @dentry from directory @dir with an
+ * ORANGEFS_VFS_OP_REMOVE upcall. Also installed as the ->rmdir hook in
+ * orangefs_dir_inode_operations.
+ *
+ * On success the link count of the removed inode is dropped and the
+ * parent's mtime/ctime are flagged and stamped.
+ *
+ * return 0 on success; non-zero otherwise
+ */
+static int orangefs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: called on %s\n"
+ " (inode %pU): Parent is %pU | fs_id %d\n",
+ __func__,
+ dentry->d_name.name,
+ get_khandle_from_ino(inode),
+ &parent->refn.khandle,
+ parent->refn.fs_id);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_REMOVE);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.remove.parent_refn = parent->refn;
+ strncpy(new_op->upcall.req.remove.d_name, dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, "orangefs_unlink",
+ get_interruptible_flag(inode));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: service_operation returned:%d:\n",
+ __func__,
+ ret);
+
+ op_release(new_op);
+
+ /* local bookkeeping happens only after the upcall succeeded */
+ if (!ret) {
+ drop_nlink(inode);
+
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+ }
+ return ret;
+}
+
+/*
+ * Create the symbolic link @dentry in @dir pointing at @symname.
+ *
+ * The target (including its terminating NUL) must fit in
+ * ORANGEFS_NAME_MAX bytes, otherwise -ENAMETOOLONG is returned. The link
+ * is created with permission bits 0755; the constant has to be written in
+ * octal, a decimal 755 would select an unrelated set of mode bits.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @symname, -ENOMEM when no op
+ * can be allocated, or the error from service_operation() /
+ * orangefs_new_inode().
+ */
+static int orangefs_symlink(struct inode *dir,
+ struct dentry *dentry,
+ const char *symname)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ int mode = 0755;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG, "%s: called\n", __func__);
+
+ if (!symname)
+ return -EINVAL;
+
+ if (strlen(symname)+1 > ORANGEFS_NAME_MAX)
+ return -ENAMETOOLONG;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_SYMLINK);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.sym.parent_refn = parent->refn;
+
+ fill_default_sys_attrs(new_op->upcall.req.sym.attributes,
+ ORANGEFS_TYPE_SYMLINK,
+ mode);
+
+ strncpy(new_op->upcall.req.sym.entry_name,
+ dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+ strncpy(new_op->upcall.req.sym.target, symname, ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Symlink Got ORANGEFS handle %pU on fsid %d (ret=%d)\n",
+ &new_op->downcall.resp.sym.refn.khandle,
+ new_op->downcall.resp.sym.refn.fs_id, ret);
+
+ if (ret < 0) {
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: failed with error code %d\n",
+ __func__, ret);
+ goto out;
+ }
+
+ /* the downcall carries the reference of the object just created */
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0,
+ &new_op->downcall.resp.sym.refn);
+ if (IS_ERR(inode)) {
+ gossip_err
+ ("*** Failed to allocate orangefs symlink inode\n");
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Assigned symlink inode new number of %pU\n",
+ get_khandle_from_ino(inode));
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Inode (Symlink) %pU -> %s\n",
+ get_khandle_from_ino(inode),
+ dentry->d_name.name);
+
+ /* the parent directory changed: flag and stamp its mtime/ctime */
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+ ret = 0;
+out:
+ op_release(new_op);
+ return ret;
+}
+
+/*
+ * Create the directory @dentry in @dir.
+ *
+ * Sends an ORANGEFS_VFS_OP_MKDIR upcall, builds an inode for the returned
+ * object reference, attaches it to the dentry and marks the parent's
+ * mtime/ctime as changed.
+ *
+ * Returns the (non-negative) result of service_operation() on success,
+ * -ENOMEM when no op can be allocated, or the error from
+ * service_operation() / orangefs_new_inode().
+ */
+static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct orangefs_inode_s *parent = ORANGEFS_I(dir);
+ struct orangefs_kernel_op_s *new_op;
+ struct inode *inode;
+ int ret;
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.mkdir.parent_refn = parent->refn;
+
+ fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes,
+ ORANGEFS_TYPE_DIRECTORY, mode);
+
+ strncpy(new_op->upcall.req.mkdir.d_name,
+ dentry->d_name.name, ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op, __func__, get_interruptible_flag(dir));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Mkdir Got ORANGEFS handle %pU on fsid %d\n",
+ &new_op->downcall.resp.mkdir.refn.khandle,
+ new_op->downcall.resp.mkdir.refn.fs_id);
+
+ if (ret < 0) {
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "%s: failed with error code %d\n",
+ __func__, ret);
+ goto out;
+ }
+
+ /* the downcall carries the reference of the object just created */
+ inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0,
+ &new_op->downcall.resp.mkdir.refn);
+ if (IS_ERR(inode)) {
+ gossip_err("*** Failed to allocate orangefs dir inode\n");
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Assigned dir inode new number of %pU\n",
+ get_khandle_from_ino(inode));
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "Inode (Directory) %pU -> %s\n",
+ get_khandle_from_ino(inode),
+ dentry->d_name.name);
+
+ /*
+ * NOTE: we have no good way to keep nlink consistent for directories
+ * across clients; keep constant at 1.
+ */
+ SetMtimeFlag(parent);
+ dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
+ mark_inode_dirty_sync(dir);
+out:
+ op_release(new_op);
+ return ret;
+}
+
+/*
+ * Rename @old_dentry in @old_dir to @new_dentry in @new_dir with an
+ * ORANGEFS_VFS_OP_RENAME upcall.
+ *
+ * If @new_dentry already has an inode, that inode's ctime is stamped
+ * after the upcall.
+ *
+ * Returns the result of service_operation(), or -ENOMEM when no op can
+ * be allocated (the same error every other operation in this file uses
+ * for a failed op_alloc()).
+ */
+static int orangefs_rename(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+{
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n",
+ old_dentry->d_parent->d_name.name,
+ old_dentry->d_name.name,
+ new_dentry->d_parent->d_name.name,
+ new_dentry->d_name.name,
+ d_count(new_dentry));
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
+ if (!new_op)
+ return -ENOMEM;
+
+ new_op->upcall.req.rename.old_parent_refn = ORANGEFS_I(old_dir)->refn;
+ new_op->upcall.req.rename.new_parent_refn = ORANGEFS_I(new_dir)->refn;
+
+ strncpy(new_op->upcall.req.rename.d_old_name,
+ old_dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+ strncpy(new_op->upcall.req.rename.d_new_name,
+ new_dentry->d_name.name,
+ ORANGEFS_NAME_MAX);
+
+ ret = service_operation(new_op,
+ "orangefs_rename",
+ get_interruptible_flag(old_dentry->d_inode));
+
+ gossip_debug(GOSSIP_NAME_DEBUG,
+ "orangefs_rename: got downcall status %d\n",
+ ret);
+
+ /* stamp the ctime of an inode that already sits at the target name */
+ if (new_dentry->d_inode)
+ new_dentry->d_inode->i_ctime = CURRENT_TIME;
+
+ op_release(new_op);
+ return ret;
+}
+
+/*
+ * ORANGEFS implementation of VFS inode operations for directories.
+ * Installed as i_op for directories by orangefs_init_iops(). Note that
+ * .rmdir shares its implementation with .unlink.
+ */
+struct inode_operations orangefs_dir_inode_operations = {
+ .lookup = orangefs_lookup,
+ .get_acl = orangefs_get_acl,
+ .set_acl = orangefs_set_acl,
+ .create = orangefs_create,
+ .unlink = orangefs_unlink,
+ .symlink = orangefs_symlink,
+ .mkdir = orangefs_mkdir,
+ .rmdir = orangefs_unlink,
+ .rename = orangefs_rename,
+ .setattr = orangefs_setattr,
+ .getattr = orangefs_getattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = orangefs_listxattr,
+ .permission = orangefs_permission,
+};
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
new file mode 100644
index 000000000000..75375e90a63f
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -0,0 +1,556 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+/*
+ * Bookkeeping for one pool of buffer slots. The functions in this file
+ * access the fields with q.lock held.
+ */
+struct slot_map {
+ int c; /* free-slot counter: -1 when no map is installed; mark_killed() subtracts count + 1 */
+ wait_queue_head_t q; /* sleepers waiting for a slot or for rundown; q.lock guards the struct */
+ int count; /* total number of slots, set by install() */
+ unsigned long *map; /* bitmap with one bit per slot, set while the slot is in use */
+};
+
+/* slot pool behind orangefs_bufmap_get()/orangefs_bufmap_put(); c = -1: nothing installed yet */
+static struct slot_map rw_map = {
+ .c = -1,
+ .q = __WAIT_QUEUE_HEAD_INITIALIZER(rw_map.q)
+};
+/* slot pool behind orangefs_readdir_index_get()/orangefs_readdir_index_put() */
+static struct slot_map readdir_map = {
+ .c = -1,
+ .q = __WAIT_QUEUE_HEAD_INITIALIZER(readdir_map.q)
+};
+
+
+/*
+ * Attach a slot bitmap to the map: record the number of slots, mark all
+ * of them free and wake every task sleeping on the map's wait queue.
+ */
+static void install(struct slot_map *m, int count, unsigned long *map)
+{
+	spin_lock(&m->q.lock);
+	m->map = map;
+	m->count = count;
+	m->c = count;
+	wake_up_all_locked(&m->q);
+	spin_unlock(&m->q.lock);
+}
+
+/*
+ * Flag the map as dying by lowering the free-slot counter by count + 1.
+ * With every slot free the counter lands on -1; otherwise it ends up
+ * below -1 and put() walks it back up as slots are returned.
+ */
+static void mark_killed(struct slot_map *m)
+{
+	spin_lock(&m->q.lock);
+	m->c = m->c - (m->count + 1);
+	spin_unlock(&m->q.lock);
+}
+
+/*
+ * Wait until the slot counter is back at -1, then detach the bitmap from
+ * the map. Sleeps in TASK_UNINTERRUPTIBLE on m->q; put() wakes all
+ * waiters when its increment brings the counter to -1.
+ */
+static void run_down(struct slot_map *m)
+{
+ DEFINE_WAIT(wait);
+ spin_lock(&m->q.lock);
+ if (m->c != -1) {
+ for (;;) {
+ /* (re)queue ourselves whenever we are not on the wait queue */
+ if (likely(list_empty(&wait.task_list)))
+ __add_wait_queue_tail(&m->q, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ if (m->c == -1)
+ break;
+
+ /* sleep with the lock dropped so put() can take it */
+ spin_unlock(&m->q.lock);
+ schedule();
+ spin_lock(&m->q.lock);
+ }
+ __remove_wait_queue(&m->q, &wait);
+ __set_current_state(TASK_RUNNING);
+ }
+ m->map = NULL;
+ spin_unlock(&m->q.lock);
+}
+
+/*
+ * Give a slot back: clear its bit and bump the free-slot counter.
+ * One waiter is woken when the pool goes from empty to one free slot;
+ * all waiters are woken when the counter reaches -1.
+ */
+static void put(struct slot_map *m, int slot)
+{
+	spin_lock(&m->q.lock);
+	__clear_bit(slot, m->map);
+	m->c++;
+	if (unlikely(m->c == 1)) {
+		/* no free slots -> one free slot */
+		wake_up_locked(&m->q);
+	} else if (unlikely(m->c == -1)) {
+		/* finished dying */
+		wake_up_all_locked(&m->q);
+	}
+	spin_unlock(&m->q.lock);
+}
+
+/*
+ * Sleep until the map has a free slot (m->c > 0).
+ *
+ * Entered and left with m->q.lock held (get() takes it); the lock is
+ * dropped only around schedule_timeout(). The overall budget is
+ * slot_timeout_secs seconds. While m->c is negative each sleep is capped
+ * at ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS seconds.
+ *
+ * Returns 0 when a slot is available, -EINTR if a signal is pending,
+ * -ETIMEDOUT if the wait ran out.
+ */
+static int wait_for_free(struct slot_map *m)
+{
+ long left = slot_timeout_secs * HZ;
+ DEFINE_WAIT(wait);
+
+ do {
+ long n = left, t; /* n: jiffies to sleep this round, t: jiffies left after the sleep */
+ if (likely(list_empty(&wait.task_list)))
+ __add_wait_queue_tail_exclusive(&m->q, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ if (m->c > 0)
+ break;
+
+ if (m->c < 0) {
+ /* we are waiting for map to be installed */
+ /* it would better be there soon, or we go away */
+ if (n > ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ)
+ n = ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ;
+ }
+ spin_unlock(&m->q.lock);
+ t = schedule_timeout(n);
+ spin_lock(&m->q.lock);
+ /* a capped sleep that expired with c still negative: give up (left = 0) */
+ if (unlikely(!t) && n != left && m->c < 0)
+ left = t;
+ else
+ left = t + (left - n); /* unslept part of this round plus the rest of the budget */
+ if (unlikely(signal_pending(current)))
+ left = -EINTR;
+ } while (left > 0);
+
+ /*
+ * Still queued: just unlink. Already dequeued while we are failing:
+ * wake another waiter - presumably to hand on an exclusive wakeup we
+ * will not use (NOTE(review): confirm intent).
+ */
+ if (!list_empty(&wait.task_list))
+ list_del(&wait.task_list);
+ else if (left <= 0 && waitqueue_active(&m->q))
+ __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
+ __set_current_state(TASK_RUNNING);
+
+ if (likely(left > 0))
+ return 0;
+
+ return left < 0 ? -EINTR : -ETIMEDOUT;
+}
+
+/*
+ * Claim a free slot, sleeping in wait_for_free() when none is available.
+ * Returns the slot index, or the negative errno from wait_for_free().
+ */
+static int get(struct slot_map *m)
+{
+	int slot;
+
+	spin_lock(&m->q.lock);
+	slot = unlikely(m->c <= 0) ? wait_for_free(m) : 0;
+	if (likely(slot == 0)) {
+		/* take the lowest clear bit and account for it */
+		m->c--;
+		slot = find_first_zero_bit(m->map, m->count);
+		__set_bit(slot, m->map);
+	}
+	spin_unlock(&m->q.lock);
+
+	return slot;
+}
+
+/* describes one mapped buffer: a run of pages inside the bufmap's page array */
+struct orangefs_bufmap_desc {
+ void *uaddr; /* user space address pointer */
+ struct page **page_array; /* first of this descriptor's pages in the bufmap page array */
+ int array_count; /* number of pages in page_array */
+ struct list_head list_link; /* not referenced in this file */
+};
+
+/*
+ * The shared buffer handed over by userspace. __orangefs_bufmap points
+ * at the installed instance and is NULL when there is none.
+ */
+static struct orangefs_bufmap {
+ int desc_size; /* bytes per descriptor (user_desc->size) */
+ int desc_shift; /* ilog2(desc_size) */
+ int desc_count; /* number of descriptors (user_desc->count) */
+ int total_size; /* bytes in the whole buffer (user_desc->total_size) */
+ int page_count; /* total_size / PAGE_SIZE */
+
+ struct page **page_array; /* pinned user pages, page_count entries */
+ struct orangefs_bufmap_desc *desc_array; /* desc_count descriptors */
+
+ /* array to track usage of buffer descriptors */
+ unsigned long *buffer_index_array;
+
+ /* array to track usage of buffer descriptors for readdir */
+#define N DIV_ROUND_UP(ORANGEFS_READDIR_DEFAULT_DESC_COUNT, BITS_PER_LONG)
+ unsigned long readdir_index_array[N];
+#undef N
+} *__orangefs_bufmap;
+
+/* taken around installing, clearing and querying __orangefs_bufmap */
+static DEFINE_SPINLOCK(orangefs_bufmap_lock);
+
+/* Drop the page reference on every page in the bufmap's page array. */
+static void
+orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
+{
+	int idx = 0;
+
+	while (idx < bufmap->page_count) {
+		put_page(bufmap->page_array[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Free the bufmap and the three arrays hanging off it. The pages are not
+ * touched here; see orangefs_bufmap_unmap().
+ */
+static void
+orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
+{
+	kfree(bufmap->buffer_index_array);
+	kfree(bufmap->desc_array);
+	kfree(bufmap->page_array);
+
+	/* the container goes last, after everything it points at */
+	kfree(bufmap);
+}
+
+/*
+ * XXX: Can the size and shift change while the caller gives up the
+ * XXX: lock between calling this and doing something useful?
+ */
+
+/* Returns the installed bufmap's descriptor size, or 0 if none is installed. */
+int orangefs_bufmap_size_query(void)
+{
+	int size;
+
+	spin_lock(&orangefs_bufmap_lock);
+	size = __orangefs_bufmap ? __orangefs_bufmap->desc_size : 0;
+	spin_unlock(&orangefs_bufmap_lock);
+
+	return size;
+}
+
+/* Returns the installed bufmap's descriptor shift, or 0 if none is installed. */
+int orangefs_bufmap_shift_query(void)
+{
+	int shift;
+
+	spin_lock(&orangefs_bufmap_lock);
+	shift = __orangefs_bufmap ? __orangefs_bufmap->desc_shift : 0;
+	spin_unlock(&orangefs_bufmap_lock);
+
+	return shift;
+}
+
+/*
+ * NOTE(review): neither queue is referenced anywhere else in this file;
+ * the slot maps sleep on their own wait queues. Candidates for removal.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
+
+/*
+ * orangefs_get_bufmap_init
+ *
+ * Reports whether a bufmap is currently installed, i.e. whether the
+ * shared memory system, including the buffer_index_array, is available.
+ *
+ * returns 1 if it is, 0 otherwise
+ */
+int orangefs_get_bufmap_init(void)
+{
+	if (__orangefs_bufmap)
+		return 1;
+
+	return 0;
+}
+
+
+/*
+ * Allocate a bufmap sized from the user's description: the slot bitmap,
+ * the descriptor array and the page-pointer array. No pages are pinned
+ * here.
+ *
+ * Returns the new bufmap, or NULL if any allocation fails (everything
+ * allocated so far is freed again).
+ */
+static struct orangefs_bufmap *
+orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
+{
+	struct orangefs_bufmap *bufmap;
+
+	bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
+	if (!bufmap)
+		goto out;
+
+	bufmap->total_size = user_desc->total_size;
+	bufmap->desc_count = user_desc->count;
+	bufmap->desc_size = user_desc->size;
+	bufmap->desc_shift = ilog2(bufmap->desc_size);
+
+	/*
+	 * One bit per descriptor, stored in whole longs. The bitmap helpers
+	 * work on unsigned longs, so the size must be a number of longs,
+	 * not a number of bytes.
+	 */
+	bufmap->buffer_index_array =
+		kcalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG),
+			sizeof(unsigned long), GFP_KERNEL);
+	if (!bufmap->buffer_index_array) {
+		gossip_err("orangefs: could not allocate %d buffer indices\n",
+			   bufmap->desc_count);
+		goto out_free_bufmap;
+	}
+
+	bufmap->desc_array =
+		kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc),
+			GFP_KERNEL);
+	if (!bufmap->desc_array) {
+		gossip_err("orangefs: could not allocate %d descriptors\n",
+			   bufmap->desc_count);
+		goto out_free_index_array;
+	}
+
+	bufmap->page_count = bufmap->total_size / PAGE_SIZE;
+
+	/* allocate storage to track our page mappings */
+	bufmap->page_array =
+		kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
+	if (!bufmap->page_array)
+		goto out_free_desc_array;
+
+	return bufmap;
+
+out_free_desc_array:
+	kfree(bufmap->desc_array);
+out_free_index_array:
+	kfree(bufmap->buffer_index_array);
+out_free_bufmap:
+	kfree(bufmap);
+out:
+	return NULL;
+}
+
+/*
+ * Pin the user's buffer and carve it into descriptors.
+ *
+ * Returns 0 on success, the error from get_user_pages_fast(), or -ENOMEM
+ * when fewer pages than requested could be pinned (those that were
+ * pinned are released again).
+ */
+static int
+orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
+		    struct ORANGEFS_dev_map_desc *user_desc)
+{
+	int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
+	int pinned, first_page, idx;
+
+	/* map the pages */
+	pinned = get_user_pages_fast((unsigned long)user_desc->ptr,
+				     bufmap->page_count, 1, bufmap->page_array);
+	if (pinned < 0)
+		return pinned;
+
+	if (pinned != bufmap->page_count) {
+		gossip_err("orangefs error: asked for %d pages, only got %d.\n",
+			   bufmap->page_count, pinned);
+
+		/* back out of the partial pin */
+		idx = 0;
+		while (idx < pinned) {
+			SetPageError(bufmap->page_array[idx]);
+			put_page(bufmap->page_array[idx]);
+			idx++;
+		}
+		return -ENOMEM;
+	}
+
+	/*
+	 * ideally we want to get kernel space pointers for each page, but
+	 * we can't kmap that many pages at once if highmem is being used.
+	 * so instead, we just kmap/kunmap the page address each time the
+	 * kaddr is needed.
+	 */
+	for (idx = 0; idx < bufmap->page_count; idx++)
+		flush_dcache_page(bufmap->page_array[idx]);
+
+	/* build a list of available descriptors */
+	first_page = 0;
+	for (idx = 0; idx < bufmap->desc_count; idx++) {
+		struct orangefs_bufmap_desc *desc = &bufmap->desc_array[idx];
+
+		desc->page_array = &bufmap->page_array[first_page];
+		desc->array_count = pages_per_desc;
+		desc->uaddr =
+		    (user_desc->ptr + (idx * pages_per_desc * PAGE_SIZE));
+		first_page += pages_per_desc;
+	}
+
+	return 0;
+}
+
+/*
+ * orangefs_bufmap_initialize()
+ *
+ * Validates the user-supplied buffer description, pins the user pages
+ * and installs the result as the global bufmap, which makes the rw and
+ * readdir slot maps available.
+ *
+ * returns 0 on success, -errno on failure: -EINVAL for a bad description
+ * or when a bufmap is already installed, -ENOMEM when allocation fails,
+ * or the error from orangefs_bufmap_map()
+ */
+int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc)
+{
+	struct orangefs_bufmap *bufmap;
+	int ret = -EINVAL;
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "orangefs_bufmap_initialize: called (ptr ("
+		     "%p) sz (%d) cnt(%d).\n",
+		     user_desc->ptr,
+		     user_desc->size,
+		     user_desc->count);
+
+	/*
+	 * A zero or negative descriptor size or count cannot describe a
+	 * usable buffer, and would feed ilog2() and the pages-per-descriptor
+	 * math values they cannot handle.
+	 */
+	if (user_desc->size <= 0 || user_desc->count <= 0) {
+		gossip_err("orangefs error: invalid bufmap size or count (%d, %d)\n",
+			   user_desc->size,
+			   user_desc->count);
+		goto out;
+	}
+
+	/*
+	 * sanity check alignment and size of buffer that caller wants to
+	 * work with
+	 */
+	if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
+	    (unsigned long)user_desc->ptr) {
+		gossip_err("orangefs error: memory alignment (front). %p\n",
+			   user_desc->ptr);
+		goto out;
+	}
+
+	if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
+	    != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
+		gossip_err("orangefs error: memory alignment (back).(%p + %d)\n",
+			   user_desc->ptr,
+			   user_desc->total_size);
+		goto out;
+	}
+
+	/* multiply in 64 bits so a huge size * count cannot overflow int */
+	if (user_desc->total_size !=
+	    (long long)user_desc->size * user_desc->count) {
+		gossip_err("orangefs error: user provided an oddly sized buffer: (%d, %d, %d)\n",
+			   user_desc->total_size,
+			   user_desc->size,
+			   user_desc->count);
+		goto out;
+	}
+
+	if ((user_desc->size % PAGE_SIZE) != 0) {
+		gossip_err("orangefs error: bufmap size not page size divisible (%d).\n",
+			   user_desc->size);
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	bufmap = orangefs_bufmap_alloc(user_desc);
+	if (!bufmap)
+		goto out;
+
+	ret = orangefs_bufmap_map(bufmap, user_desc);
+	if (ret)
+		goto out_free_bufmap;
+
+
+	spin_lock(&orangefs_bufmap_lock);
+	if (__orangefs_bufmap) {
+		spin_unlock(&orangefs_bufmap_lock);
+		gossip_err("orangefs: error: bufmap already initialized.\n");
+		ret = -EINVAL;
+		goto out_unmap_bufmap;
+	}
+	__orangefs_bufmap = bufmap;
+	install(&rw_map,
+		bufmap->desc_count,
+		bufmap->buffer_index_array);
+	install(&readdir_map,
+		ORANGEFS_READDIR_DEFAULT_DESC_COUNT,
+		bufmap->readdir_index_array);
+	spin_unlock(&orangefs_bufmap_lock);
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "orangefs_bufmap_initialize: exiting normally\n");
+	return 0;
+
+out_unmap_bufmap:
+	orangefs_bufmap_unmap(bufmap);
+out_free_bufmap:
+	orangefs_bufmap_free(bufmap);
+out:
+	return ret;
+}
+
+/*
+ * orangefs_bufmap_finalize()
+ *
+ * marks both slot maps as killed if a bufmap is installed; the bufmap
+ * itself is torn down by orangefs_bufmap_run_down()
+ *
+ * no return value
+ */
+void orangefs_bufmap_finalize(void)
+{
+	if (__orangefs_bufmap == NULL)
+		return;
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs_bufmap_finalize: called\n");
+
+	mark_killed(&rw_map);
+	mark_killed(&readdir_map);
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "orangefs_bufmap_finalize: exiting normally\n");
+}
+
+/*
+ * Tear down the installed bufmap, if any: wait for both slot maps to run
+ * down, clear the global pointer, then release the pages and free the
+ * bufmap.
+ */
+void orangefs_bufmap_run_down(void)
+{
+ struct orangefs_bufmap *bufmap = __orangefs_bufmap;
+ if (!bufmap)
+ return;
+ /* each call sleeps until that map's counter is back at -1 */
+ run_down(&rw_map);
+ run_down(&readdir_map);
+ spin_lock(&orangefs_bufmap_lock);
+ __orangefs_bufmap = NULL;
+ spin_unlock(&orangefs_bufmap_lock);
+ orangefs_bufmap_unmap(bufmap);
+ orangefs_bufmap_free(bufmap);
+}
+
+/*
+ * orangefs_bufmap_get()
+ *
+ * gets a free mapped buffer descriptor from the rw slot map, will sleep
+ * until one becomes available if necessary
+ *
+ * returns slot on success, -errno on failure (-EINTR or -ETIMEDOUT from
+ * the wait)
+ */
+int orangefs_bufmap_get(void)
+{
+ return get(&rw_map);
+}
+
+/*
+ * orangefs_bufmap_put()
+ *
+ * returns a mapped buffer descriptor to the rw slot map; buffer_index is
+ * the slot number previously returned by orangefs_bufmap_get()
+ *
+ * no return value
+ */
+void orangefs_bufmap_put(int buffer_index)
+{
+ put(&rw_map, buffer_index);
+}
+
+/*
+ * orangefs_readdir_index_get()
+ *
+ * gets a free descriptor from the readdir slot map, will sleep until one
+ * becomes available if necessary.
+ * Although the readdir buffers are not mapped into kernel space
+ * we could do that at a later point of time. Regardless, these
+ * indices are used by the client-core.
+ *
+ * returns slot on success, -errno on failure (-EINTR or -ETIMEDOUT from
+ * the wait)
+ */
+int orangefs_readdir_index_get(void)
+{
+ return get(&readdir_map);
+}
+
+/* returns a readdir slot obtained from orangefs_readdir_index_get() to its map */
+void orangefs_readdir_index_put(int buffer_index)
+{
+ put(&readdir_map, buffer_index);
+}
+
+/*
+ * Copy "size" bytes from the iterator into the shared memory descriptor
+ * at "buffer_index", filling its pages in order from offset 0.
+ *
+ * returns 0 on success, -EFAULT as soon as a copy moves no bytes
+ */
+int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
+				    int buffer_index,
+				    size_t size)
+{
+	struct orangefs_bufmap_desc *to;
+	int page_idx = 0;
+	size_t chunk;
+
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "%s: buffer_index:%d: size:%zu:\n",
+		     __func__, buffer_index, size);
+
+	to = &__orangefs_bufmap->desc_array[buffer_index];
+	while (size != 0) {
+		/* at most one page per pass */
+		chunk = size > PAGE_SIZE ? PAGE_SIZE : size;
+		chunk = copy_page_from_iter(to->page_array[page_idx], 0,
+					    chunk, iter);
+		if (chunk == 0)
+			return -EFAULT;
+		size -= chunk;
+		page_idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy "size" bytes out of the shared memory descriptor at
+ * "buffer_index" into the iterator, reading its pages in order from
+ * offset 0.
+ *
+ * returns 0 on success, -EFAULT as soon as a copy moves no bytes
+ */
+int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
+				    int buffer_index,
+				    size_t size)
+{
+	struct orangefs_bufmap_desc *from;
+	int page_idx = 0;
+	size_t chunk;
+
+	from = &__orangefs_bufmap->desc_array[buffer_index];
+	gossip_debug(GOSSIP_BUFMAP_DEBUG,
+		     "%s: buffer_index:%d: size:%zu:\n",
+		     __func__, buffer_index, size);
+
+	while (size != 0) {
+		/* at most one page per pass */
+		chunk = size > PAGE_SIZE ? PAGE_SIZE : size;
+		chunk = copy_page_to_iter(from->page_array[page_idx], 0,
+					  chunk, iter);
+		if (chunk == 0)
+			return -EFAULT;
+		size -= chunk;
+		page_idx++;
+	}
+
+	return 0;
+}
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
new file mode 100644
index 000000000000..71f64f4057b5
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -0,0 +1,36 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef __ORANGEFS_BUFMAP_H
+#define __ORANGEFS_BUFMAP_H
+
+/* descriptor size of the installed bufmap, 0 if none is installed */
+int orangefs_bufmap_size_query(void);
+
+/* descriptor shift of the installed bufmap, 0 if none is installed */
+int orangefs_bufmap_shift_query(void);
+
+/* validate, pin and install a user-described buffer; 0 or -errno */
+int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc);
+
+/* mark the slot maps as killed */
+void orangefs_bufmap_finalize(void);
+
+/* wait for the slot maps to run down, then release the bufmap */
+void orangefs_bufmap_run_down(void);
+
+/* claim a read/write buffer slot; slot index or -errno */
+int orangefs_bufmap_get(void);
+
+/* give back a slot obtained from orangefs_bufmap_get() */
+void orangefs_bufmap_put(int buffer_index);
+
+/* claim a readdir slot; slot index or -errno */
+int orangefs_readdir_index_get(void);
+
+/* give back a slot obtained from orangefs_readdir_index_get() */
+void orangefs_readdir_index_put(int buffer_index);
+
+/* copy size bytes from iter into the buffer at buffer_index; 0 or -EFAULT */
+int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
+ int buffer_index,
+ size_t size);
+
+/* copy size bytes from the buffer at buffer_index into iter; 0 or -EFAULT */
+int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
+ int buffer_index,
+ size_t size);
+
+#endif /* __ORANGEFS_BUFMAP_H */
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
new file mode 100644
index 000000000000..900a2e38e11b
--- /dev/null
+++ b/fs/orangefs/orangefs-cache.c
@@ -0,0 +1,161 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/* next tag to assign to a kernel upcall operation; guarded by next_tag_value_lock */
+static __u64 next_tag_value;
+static DEFINE_SPINLOCK(next_tag_value_lock);
+
+/* the orangefs memory caches */
+
+/* a cache for orangefs upcall/downcall operations (struct orangefs_kernel_op_s) */
+static struct kmem_cache *op_cache;
+
+/*
+ * Create the slab cache for upcall/downcall ops and reset the tag
+ * counter. Returns 0 on success, -ENOMEM if the cache cannot be created.
+ */
+int op_cache_initialize(void)
+{
+	op_cache = kmem_cache_create("orangefs_op_cache",
+				     sizeof(struct orangefs_kernel_op_s),
+				     0,
+				     ORANGEFS_CACHE_CREATE_FLAGS,
+				     NULL);
+	if (op_cache == NULL) {
+		gossip_err("Cannot create orangefs_op_cache\n");
+		return -ENOMEM;
+	}
+
+	/* the first tag handed out is 100 */
+	spin_lock(&next_tag_value_lock);
+	next_tag_value = 100;
+	spin_unlock(&next_tag_value_lock);
+
+	return 0;
+}
+
+/* Destroy the op slab cache created by op_cache_initialize(); always returns 0. */
+int op_cache_finalize(void)
+{
+ kmem_cache_destroy(op_cache);
+ return 0;
+}
+
+/*
+ * Translate an op's upcall type into a printable name.
+ * Returns "OP_UNKNOWN?" for a NULL op or an unrecognised type.
+ */
+char *get_opname_string(struct orangefs_kernel_op_s *new_op)
+{
+	/* the op type constants are unsigned, hence the unsigned key */
+	static const struct {
+		unsigned int type;
+		char *name;
+	} opnames[] = {
+		{ ORANGEFS_VFS_OP_FILE_IO,	 "OP_FILE_IO" },
+		{ ORANGEFS_VFS_OP_LOOKUP,	 "OP_LOOKUP" },
+		{ ORANGEFS_VFS_OP_CREATE,	 "OP_CREATE" },
+		{ ORANGEFS_VFS_OP_GETATTR,	 "OP_GETATTR" },
+		{ ORANGEFS_VFS_OP_REMOVE,	 "OP_REMOVE" },
+		{ ORANGEFS_VFS_OP_MKDIR,	 "OP_MKDIR" },
+		{ ORANGEFS_VFS_OP_READDIR,	 "OP_READDIR" },
+		{ ORANGEFS_VFS_OP_READDIRPLUS,	 "OP_READDIRPLUS" },
+		{ ORANGEFS_VFS_OP_SETATTR,	 "OP_SETATTR" },
+		{ ORANGEFS_VFS_OP_SYMLINK,	 "OP_SYMLINK" },
+		{ ORANGEFS_VFS_OP_RENAME,	 "OP_RENAME" },
+		{ ORANGEFS_VFS_OP_STATFS,	 "OP_STATFS" },
+		{ ORANGEFS_VFS_OP_TRUNCATE,	 "OP_TRUNCATE" },
+		{ ORANGEFS_VFS_OP_MMAP_RA_FLUSH, "OP_MMAP_RA_FLUSH" },
+		{ ORANGEFS_VFS_OP_FS_MOUNT,	 "OP_FS_MOUNT" },
+		{ ORANGEFS_VFS_OP_FS_UMOUNT,	 "OP_FS_UMOUNT" },
+		{ ORANGEFS_VFS_OP_GETXATTR,	 "OP_GETXATTR" },
+		{ ORANGEFS_VFS_OP_SETXATTR,	 "OP_SETXATTR" },
+		{ ORANGEFS_VFS_OP_LISTXATTR,	 "OP_LISTXATTR" },
+		{ ORANGEFS_VFS_OP_REMOVEXATTR,	 "OP_REMOVEXATTR" },
+		{ ORANGEFS_VFS_OP_PARAM,	 "OP_PARAM" },
+		{ ORANGEFS_VFS_OP_PERF_COUNT,	 "OP_PERF_COUNT" },
+		{ ORANGEFS_VFS_OP_CANCEL,	 "OP_CANCEL" },
+		{ ORANGEFS_VFS_OP_FSYNC,	 "OP_FSYNC" },
+		{ ORANGEFS_VFS_OP_FSKEY,	 "OP_FSKEY" },
+	};
+	unsigned int idx;
+	__s32 type;
+
+	if (!new_op)
+		return "OP_UNKNOWN?";
+
+	type = new_op->upcall.type;
+	for (idx = 0; idx < sizeof(opnames) / sizeof(opnames[0]); idx++) {
+		if (type == opnames[idx].type)
+			return opnames[idx].name;
+	}
+
+	return "OP_UNKNOWN?";
+}
+
+/*
+ * Stamp the op with the next tag value. When the counter wraps to 0 it
+ * restarts at 100.
+ */
+void orangefs_new_tag(struct orangefs_kernel_op_s *op)
+{
+	spin_lock(&next_tag_value_lock);
+	op->tag = next_tag_value;
+	next_tag_value++;
+	if (!next_tag_value)
+		next_tag_value = 100;
+	spin_unlock(&next_tag_value_lock);
+}
+
+/*
+ * Allocate and initialise a zeroed op of the given upcall type: list
+ * head, lock, completion, a fresh tag and the caller's fsuid/fsgid.
+ *
+ * Returns the op, or NULL if the slab allocation fails.
+ */
+struct orangefs_kernel_op_s *op_alloc(__s32 type)
+{
+	struct orangefs_kernel_op_s *op;
+
+	op = kmem_cache_zalloc(op_cache, GFP_KERNEL);
+	if (!op) {
+		gossip_err("op_alloc: kmem_cache_zalloc failed!\n");
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&op->list);
+	spin_lock_init(&op->lock);
+	init_completion(&op->waitq);
+
+	op->upcall.type = ORANGEFS_VFS_OP_INVALID;
+	op->downcall.type = ORANGEFS_VFS_OP_INVALID;
+	op->downcall.status = -1;
+
+	op->op_state = OP_VFS_STATE_UNKNOWN;
+
+	/* initialize the op specific tag and upcall credentials */
+	orangefs_new_tag(op);
+	op->upcall.type = type;
+	op->attempts = 0;
+	gossip_debug(GOSSIP_CACHE_DEBUG,
+		     "Alloced OP (%p: %llu %s)\n",
+		     op,
+		     llu(op->tag),
+		     get_opname_string(op));
+
+	op->upcall.uid = from_kuid(current_user_ns(), current_fsuid());
+	op->upcall.gid = from_kgid(current_user_ns(), current_fsgid());
+
+	return op;
+}
+
+/* Return an op to the slab cache; a NULL op is only reported, not freed. */
+void op_release(struct orangefs_kernel_op_s *orangefs_op)
+{
+	if (!orangefs_op) {
+		gossip_err("NULL pointer in op_release\n");
+		return;
+	}
+
+	gossip_debug(GOSSIP_CACHE_DEBUG,
+		     "Releasing OP (%p: %llu)\n",
+		     orangefs_op,
+		     llu(orangefs_op->tag));
+	kmem_cache_free(op_cache, orangefs_op);
+}
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
new file mode 100644
index 000000000000..387db17cde2b
--- /dev/null
+++ b/fs/orangefs/orangefs-debug.h
@@ -0,0 +1,92 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* This file just defines debugging masks to be used with the gossip
+ * logging utility. All debugging masks for ORANGEFS are kept here to make
+ * sure we don't have collisions.
+ */
+
+#ifndef __ORANGEFS_DEBUG_H
+#define __ORANGEFS_DEBUG_H
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+/* no debugging; parenthesized like the other masks so it expands safely in any expression */
+#define GOSSIP_NO_DEBUG			((__u64)0)
+
+/* one bit per kernel-module debug category */
+#define GOSSIP_SUPER_DEBUG		((__u64)1 << 0)
+#define GOSSIP_INODE_DEBUG		((__u64)1 << 1)
+#define GOSSIP_FILE_DEBUG		((__u64)1 << 2)
+#define GOSSIP_DIR_DEBUG		((__u64)1 << 3)
+#define GOSSIP_UTILS_DEBUG		((__u64)1 << 4)
+#define GOSSIP_WAIT_DEBUG		((__u64)1 << 5)
+#define GOSSIP_ACL_DEBUG		((__u64)1 << 6)
+#define GOSSIP_DCACHE_DEBUG		((__u64)1 << 7)
+#define GOSSIP_DEV_DEBUG		((__u64)1 << 8)
+#define GOSSIP_NAME_DEBUG		((__u64)1 << 9)
+#define GOSSIP_BUFMAP_DEBUG		((__u64)1 << 10)
+#define GOSSIP_CACHE_DEBUG		((__u64)1 << 11)
+#define GOSSIP_DEBUGFS_DEBUG		((__u64)1 << 12)
+#define GOSSIP_XATTR_DEBUG		((__u64)1 << 13)
+#define GOSSIP_INIT_DEBUG		((__u64)1 << 14)
+#define GOSSIP_SYSFS_DEBUG		((__u64)1 << 15)
+
+/* number of category bits above, and the mask with all of them set */
+#define GOSSIP_MAX_NR			16
+#define GOSSIP_MAX_DEBUG		(((__u64)1 << GOSSIP_MAX_NR) - 1)
+
+/*
+ * function prototypes
+ *
+ * NOTE(review): the definitions are not in this header; going by the
+ * names, these convert between keyword strings and debug masks for the
+ * kernel module ("kmod") and the client ("debug") - confirm.
+ */
+__u64 ORANGEFS_kmod_eventlog_to_mask(const char *event_logging);
+__u64 ORANGEFS_debug_eventlog_to_mask(const char *event_logging);
+char *ORANGEFS_debug_mask_to_eventlog(__u64 mask);
+char *ORANGEFS_kmod_mask_to_eventlog(__u64 mask);
+
+/* a private internal type: pairs a debug keyword with its mask value */
+struct __keyword_mask_s {
+ const char *keyword; /* name of the debug category */
+ __u64 mask_val; /* mask bit(s) the keyword stands for */
+};
+
+/*
+ * Map all kmod keywords to kmod debug masks here. Keep this
+ * structure "packed": the entry at index i carries the mask 1 << i,
+ * followed by the two special entries "none" and "all".
+ *
+ * "all" is always last...
+ *
+ *   keyword   mask_val   index
+ *   foo       1          0
+ *   bar       2          1
+ *   baz       4          2
+ *   qux       8          3
+ *   .         .          .
+ */
+static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
+ {"super", GOSSIP_SUPER_DEBUG},
+ {"inode", GOSSIP_INODE_DEBUG},
+ {"file", GOSSIP_FILE_DEBUG},
+ {"dir", GOSSIP_DIR_DEBUG},
+ {"utils", GOSSIP_UTILS_DEBUG},
+ {"wait", GOSSIP_WAIT_DEBUG},
+ {"acl", GOSSIP_ACL_DEBUG},
+ {"dcache", GOSSIP_DCACHE_DEBUG},
+ {"dev", GOSSIP_DEV_DEBUG},
+ {"name", GOSSIP_NAME_DEBUG},
+ {"bufmap", GOSSIP_BUFMAP_DEBUG},
+ {"cache", GOSSIP_CACHE_DEBUG},
+ {"debugfs", GOSSIP_DEBUGFS_DEBUG},
+ {"xattr", GOSSIP_XATTR_DEBUG},
+ {"init", GOSSIP_INIT_DEBUG},
+ {"sysfs", GOSSIP_SYSFS_DEBUG},
+ {"none", GOSSIP_NO_DEBUG}, /* no bits set */
+ {"all", GOSSIP_MAX_DEBUG} /* every category bit set */
+};
+
+/* number of entries in s_kmod_keyword_mask_map, "none" and "all" included */
+static const int num_kmod_keyword_mask_map = (int)
+	(sizeof(s_kmod_keyword_mask_map) / sizeof(s_kmod_keyword_mask_map[0]));
+
+#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
new file mode 100644
index 000000000000..1714a737d556
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -0,0 +1,454 @@
+/*
+ * What: /sys/kernel/debug/orangefs/debug-help
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * List of client and kernel debug keywords.
+ *
+ *
+ * What: /sys/kernel/debug/orangefs/client-debug
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Debug setting for "the client", the userspace
+ * helper for the kernel module.
+ *
+ *
+ * What: /sys/kernel/debug/orangefs/kernel-debug
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Debug setting for the orangefs kernel module.
+ *
+ * Any of the keywords, or comma-separated lists
+ * of keywords, from debug-help can be catted to
+ * client-debug or kernel-debug.
+ *
+ * "none", "all" and "verbose" are special keywords
+ * for client-debug. Setting client-debug to "all"
+ * is kind of like trying to drink water from a
+ * fire hose, "verbose" triggers most of the same
+ * output except for the constant flow of output
+ * from the main wait loop.
+ *
+ * "none" and "all" are similar settings for kernel-debug
+ * no need for a "verbose".
+ */
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+#include <linux/uaccess.h>
+
+#include "orangefs-debugfs.h"
+#include "protocol.h"
+#include "orangefs-kernel.h"
+
+/*
+ * Stays 1 until orangefs_debugfs_init() has created the debugfs directory
+ * and the help file; the open handlers return -ENODEV while it is set.
+ */
+static int orangefs_debug_disabled = 1;
+
+static int orangefs_debug_help_open(struct inode *, struct file *);
+
+/* file operations for the debug-help file: a read-only seq_file */
+const struct file_operations debug_help_fops = {
+ .open = orangefs_debug_help_open,
+ .read = seq_read,
+ .release = seq_release,
+ .llseek = seq_lseek,
+};
+
+/* seq_file iterator callbacks for the debug-help file */
+static void *help_start(struct seq_file *, loff_t *);
+static void *help_next(struct seq_file *, void *, loff_t *);
+static void help_stop(struct seq_file *, void *);
+static int help_show(struct seq_file *, void *);
+
+/* handed to seq_open() by orangefs_debug_help_open() */
+static const struct seq_operations help_debug_ops = {
+ .start = help_start,
+ .next = help_next,
+ .stop = help_stop,
+ .show = help_show,
+};
+
+/*
+ * Used to protect data in ORANGEFS_KMOD_DEBUG_FILE and
+ * ORANGEFS_CLIENT_DEBUG_FILE.
+ */
+static DEFINE_MUTEX(orangefs_debug_lock);
+
+int orangefs_debug_open(struct inode *, struct file *);
+
+static ssize_t orangefs_debug_read(struct file *,
+ char __user *,
+ size_t,
+ loff_t *);
+
+static ssize_t orangefs_debug_write(struct file *,
+ const char __user *,
+ size_t,
+ loff_t *);
+
+/* file operations shared by the kernel-debug and client-debug files */
+static const struct file_operations kernel_debug_fops = {
+ .open = orangefs_debug_open,
+ .read = orangefs_debug_read,
+ .write = orangefs_debug_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * initialize kmod debug operations: create the orangefs debugfs dir and
+ * ORANGEFS_KMOD_DEBUG_HELP_FILE, then enable the debug files.
+ *
+ * returns 0 on success, -ENOMEM if either object cannot be created
+ */
+int orangefs_debugfs_init(void)
+{
+	debug_dir = debugfs_create_dir("orangefs", NULL);
+	if (debug_dir == NULL) {
+		pr_info("%s: debugfs_create_dir failed.\n", __func__);
+		return -ENOMEM;
+	}
+
+	help_file_dentry = debugfs_create_file(ORANGEFS_KMOD_DEBUG_HELP_FILE,
+					       0444,
+					       debug_dir,
+					       debug_help_string,
+					       &debug_help_fops);
+	if (help_file_dentry == NULL) {
+		pr_info("%s: debugfs_create_file failed.\n", __func__);
+		return -ENOMEM;
+	}
+
+	orangefs_debug_disabled = 0;
+
+	return 0;
+}
+
+/* remove the orangefs debugfs directory together with everything under it */
+void orangefs_debugfs_cleanup(void)
+{
+ debugfs_remove_recursive(debug_dir);
+}
+
+/*
+ * open ORANGEFS_KMOD_DEBUG_HELP_FILE
+ *
+ * Sets up a seq_file whose private data is the inode's i_private.
+ *
+ * returns 0 on success, -ENODEV while debugging is disabled, or the
+ * error from seq_open()
+ */
+static int orangefs_debug_help_open(struct inode *inode, struct file *file)
+{
+	int rc = -ENODEV;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_help_open: start\n");
+
+	if (orangefs_debug_disabled)
+		goto out;
+
+	/* report seq_open()'s own error instead of masking it as -ENODEV */
+	rc = seq_open(file, &help_debug_ops);
+	if (rc)
+		goto out;
+
+	((struct seq_file *)(file->private_data))->private = inode->i_private;
+
+out:
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_help_open: rc:%d:\n",
+		     rc);
+	return rc;
+}
+
+/*
+ * seq_file .start. The whole "payload" is a single (long) string kept in
+ * m->private, so it is handed out at position 0 only; at any later
+ * position start returns NULL to say that we are done.
+ */
+static void *help_start(struct seq_file *m, loff_t *pos)
+{
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_start: start\n");
+
+	if (*pos != 0)
+		return NULL;
+
+	return m->private;
+}
+
+/*
+ * seq_file .next: the help text is a single record, so there is never a
+ * next one. The position is still advanced, as the seq_file iterator
+ * interface expects of every .next call.
+ */
+static void *help_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
+
+	++*pos;
+	return NULL;
+}
+
+/* seq_file .stop: nothing to release, only logs the call */
+static void help_stop(struct seq_file *m, void *p)
+{
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_stop: start\n");
+}
+
+/* seq_file .show: v is the string handed out by help_start(); emit it as is */
+static int help_show(struct seq_file *m, void *v)
+{
+ gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_show: start\n");
+
+ seq_puts(m, v);
+
+ return 0;
+}
+
+/*
+ * initialize the kernel-debug file.
+ *
+ * The file's private data is a freshly allocated buffer holding the
+ * current kernel debug string plus a newline ("none\n" if the string
+ * does not fit).
+ *
+ * returns 0 on success, -ENOMEM if the buffer or the file cannot be
+ * created
+ */
+int orangefs_kernel_debug_init(void)
+{
+	int rc = -ENOMEM;
+	struct dentry *ret;
+	char *k_buffer = NULL;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+	k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!k_buffer)
+		goto out;
+
+	/* string + '\n' + NUL must fit in the buffer */
+	if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+		strcpy(k_buffer, kernel_debug_string);
+		strcat(k_buffer, "\n");
+	} else {
+		strcpy(k_buffer, "none\n");
+		pr_info("%s: overflow 1!\n", __func__);
+	}
+
+	ret = debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE,
+				  0444,
+				  debug_dir,
+				  k_buffer,
+				  &kernel_debug_fops);
+	if (!ret) {
+		pr_info("%s: failed to create %s.\n",
+			__func__,
+			ORANGEFS_KMOD_DEBUG_FILE);
+		/* no file took ownership of the buffer, so free it here */
+		kfree(k_buffer);
+		goto out;
+	}
+
+	rc = 0;
+
+out:
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+	return rc;
+}
+
+/*
+ * initialize the client-debug file.
+ *
+ * The file's private data is a freshly allocated buffer holding the
+ * current client debug string plus a newline ("none\n" if the string
+ * does not fit).
+ *
+ * returns 0 on success, -ENOMEM if the buffer or the file cannot be
+ * created
+ */
+int orangefs_client_debug_init(void)
+{
+
+	int rc = -ENOMEM;
+	char *c_buffer = NULL;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
+
+	c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!c_buffer)
+		goto out;
+
+	/* string + '\n' + NUL must fit in the buffer */
+	if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
+		strcpy(c_buffer, client_debug_string);
+		strcat(c_buffer, "\n");
+	} else {
+		strcpy(c_buffer, "none\n");
+		pr_info("%s: overflow! 2\n", __func__);
+	}
+
+	client_debug_dentry = debugfs_create_file(ORANGEFS_CLIENT_DEBUG_FILE,
+						  0444,
+						  debug_dir,
+						  c_buffer,
+						  &kernel_debug_fops);
+	if (!client_debug_dentry) {
+		pr_info("%s: failed to create updated %s.\n",
+			__func__,
+			ORANGEFS_CLIENT_DEBUG_FILE);
+		/* no file took ownership of the buffer, so free it here */
+		kfree(c_buffer);
+		goto out;
+	}
+
+	rc = 0;
+
+out:
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+	return rc;
+}
+
+/*
+ * open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.
+ *
+ * Points the file's private data at the inode's debug string buffer.
+ * Returns 0, or -ENODEV while debugging is disabled.
+ */
+int orangefs_debug_open(struct inode *inode, struct file *file)
+{
+	int rc = -ENODEV;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "%s: orangefs_debug_disabled: %d\n",
+		     __func__,
+		     orangefs_debug_disabled);
+
+	if (!orangefs_debug_disabled) {
+		rc = 0;
+		mutex_lock(&orangefs_debug_lock);
+		file->private_data = inode->i_private;
+		mutex_unlock(&orangefs_debug_lock);
+	}
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_open: rc: %d\n",
+		     rc);
+	return rc;
+}
+
+/*
+ * read handler for the kernel-debug and client-debug files.
+ *
+ * Snapshots the debug string behind file->private_data under
+ * orangefs_debug_lock and copies the requested part to userspace.
+ *
+ * returns the result of simple_read_from_buffer(), or -ENOMEM if the
+ * snapshot buffer cannot be allocated
+ */
+static ssize_t orangefs_debug_read(struct file *file,
+				 char __user *ubuf,
+				 size_t count,
+				 loff_t *ppos)
+{
+	char *buf;
+	int sprintf_ret;
+	ssize_t read_ret = -ENOMEM;
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_read: start\n");
+
+	buf = kmalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	mutex_lock(&orangefs_debug_lock);
+	sprintf_ret = sprintf(buf, "%s", (char *)file->private_data);
+	mutex_unlock(&orangefs_debug_lock);
+
+	read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret);
+
+	kfree(buf);
+
+out:
+	/* read_ret is signed: %zd keeps a negative errno readable */
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_read: ret: %zd\n",
+		     read_ret);
+
+	return read_ret;
+}
+
+/*
+ * write handler for the kernel-debug and client-debug files.
+ *
+ * Parses the keyword string written by the user into a debug mask and
+ * rebuilds the canonical string from that mask. For the client-debug
+ * file the new mask is also sent to the client with an
+ * ORANGEFS_VFS_OP_PARAM upcall. The canonical string is then stored in
+ * the inode's buffer for later reads.
+ *
+ * returns the number of bytes the caller asked to write, 0 for an empty
+ * write, -ENOMEM if the copy buffer cannot be allocated, -EFAULT on the
+ * other failures
+ */
+static ssize_t orangefs_debug_write(struct file *file,
+				  const char __user *ubuf,
+				  size_t count,
+				  loff_t *ppos)
+{
+	char *buf;
+	int rc = -EFAULT;
+	size_t silly = 0;
+	char *debug_string;
+	struct orangefs_kernel_op_s *new_op = NULL;
+	struct client_debug_mask c_mask = { NULL, 0, 0 };
+
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_write: %s\n",
+		     file->f_path.dentry->d_name.name);
+
+	/* nothing to parse; this also keeps "count - 1" below from wrapping */
+	if (count == 0)
+		return 0;
+
+	/*
+	 * Thwart users who try to jam a ridiculous number
+	 * of bytes into the debug file. Clamping to the buffer size means
+	 * the count - 1 bytes copied below always leave the last byte of
+	 * the zeroed buffer as a terminating NUL.
+	 */
+	if (count > ORANGEFS_MAX_DEBUG_STRING_LEN) {
+		silly = count;
+		count = ORANGEFS_MAX_DEBUG_STRING_LEN;
+	}
+
+	buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
+	if (!buf) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* the final byte written (normally the newline) is not copied */
+	if (copy_from_user(buf, ubuf, count - 1)) {
+		gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+			     "%s: copy_from_user failed!\n",
+			     __func__);
+		goto out;
+	}
+
+	/*
+	 * Map the keyword string from userspace into a valid debug mask.
+	 * The mapping process involves mapping the human-inputted string
+	 * into a valid mask, and then rebuilding the string from the
+	 * verified valid mask.
+	 *
+	 * A service operation is required to set a new client-side
+	 * debug mask.
+	 */
+	if (!strcmp(file->f_path.dentry->d_name.name,
+		    ORANGEFS_KMOD_DEBUG_FILE)) {
+		debug_string_to_mask(buf, &gossip_debug_mask, 0);
+		debug_mask_to_string(&gossip_debug_mask, 0);
+		debug_string = kernel_debug_string;
+		gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+			     "New kernel debug string is %s\n",
+			     kernel_debug_string);
+	} else {
+		/* Can't reset client debug mask if client is not running. */
+		if (is_daemon_in_service()) {
+			pr_info("%s: Client not running :%d:\n",
+				__func__,
+				is_daemon_in_service());
+			goto out;
+		}
+
+		debug_string_to_mask(buf, &c_mask, 1);
+		debug_mask_to_string(&c_mask, 1);
+		debug_string = client_debug_string;
+
+		new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
+		if (!new_op) {
+			pr_info("%s: op_alloc failed!\n", __func__);
+			goto out;
+		}
+
+		new_op->upcall.req.param.op =
+			ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES;
+		new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+		memset(new_op->upcall.req.param.s_value,
+		       0,
+		       ORANGEFS_MAX_DEBUG_STRING_LEN);
+		sprintf(new_op->upcall.req.param.s_value,
+			"%llx %llx\n",
+			c_mask.mask1,
+			c_mask.mask2);
+
+		/* service_operation returns 0 on success... */
+		rc = service_operation(new_op,
+				       "orangefs_param",
+				       ORANGEFS_OP_INTERRUPTIBLE);
+
+		/* a failed upcall is only logged; the write still succeeds */
+		if (rc)
+			gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+				     "%s: service_operation failed! rc:%d:\n",
+				     __func__,
+				     rc);
+
+		op_release(new_op);
+	}
+
+	mutex_lock(&orangefs_debug_lock);
+	memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+	sprintf((char *)file->f_inode->i_private, "%s\n", debug_string);
+	mutex_unlock(&orangefs_debug_lock);
+
+	/* report the caller's original length even when it was clamped */
+	*ppos += count;
+	if (silly)
+		rc = silly;
+	else
+		rc = count;
+
+out:
+	gossip_debug(GOSSIP_DEBUGFS_DEBUG,
+		     "orangefs_debug_write: rc: %d\n",
+		     rc);
+	kfree(buf);
+	return rc;
+}
diff --git a/fs/orangefs/orangefs-debugfs.h b/fs/orangefs/orangefs-debugfs.h
new file mode 100644
index 000000000000..e4828c0e3ef9
--- /dev/null
+++ b/fs/orangefs/orangefs-debugfs.h
@@ -0,0 +1,3 @@
+/*
+ * Entry points used by orangefs_init() and orangefs_exit() in orangefs-mod.c.
+ * orangefs_init() treats a non-zero return from either init function as
+ * failure.
+ */
+int orangefs_debugfs_init(void);
+int orangefs_kernel_debug_init(void);
+void orangefs_debugfs_cleanup(void);
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
new file mode 100644
index 000000000000..9eac9d9a3f3a
--- /dev/null
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -0,0 +1,62 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef _ORANGEFS_DEV_PROTO_H
+#define _ORANGEFS_DEV_PROTO_H
+
+/*
+ * types and constants shared between user space and kernel space for
+ * device interaction using a common protocol
+ */
+
+/*
+ * valid orangefs kernel operation types
+ *
+ * One of these values is passed to op_alloc() as the op type, and
+ * op_is_cancel() compares an op's upcall.type against
+ * ORANGEFS_VFS_OP_CANCEL.
+ */
+#define ORANGEFS_VFS_OP_INVALID 0xFF000000
+#define ORANGEFS_VFS_OP_FILE_IO 0xFF000001
+#define ORANGEFS_VFS_OP_LOOKUP 0xFF000002
+#define ORANGEFS_VFS_OP_CREATE 0xFF000003
+#define ORANGEFS_VFS_OP_GETATTR 0xFF000004
+#define ORANGEFS_VFS_OP_REMOVE 0xFF000005
+#define ORANGEFS_VFS_OP_MKDIR 0xFF000006
+#define ORANGEFS_VFS_OP_READDIR 0xFF000007
+#define ORANGEFS_VFS_OP_SETATTR 0xFF000008
+#define ORANGEFS_VFS_OP_SYMLINK 0xFF000009
+#define ORANGEFS_VFS_OP_RENAME 0xFF00000A
+#define ORANGEFS_VFS_OP_STATFS 0xFF00000B
+#define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C
+#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D
+#define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E
+#define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F
+#define ORANGEFS_VFS_OP_GETXATTR 0xFF000010
+#define ORANGEFS_VFS_OP_SETXATTR 0xFF000011
+#define ORANGEFS_VFS_OP_LISTXATTR 0xFF000012
+#define ORANGEFS_VFS_OP_REMOVEXATTR 0xFF000013
+#define ORANGEFS_VFS_OP_PARAM 0xFF000014
+#define ORANGEFS_VFS_OP_PERF_COUNT 0xFF000015
+#define ORANGEFS_VFS_OP_CANCEL 0xFF00EE00
+#define ORANGEFS_VFS_OP_FSYNC 0xFF00EE01
+#define ORANGEFS_VFS_OP_FSKEY 0xFF00EE02
+#define ORANGEFS_VFS_OP_READDIRPLUS 0xFF00EE03
+
+/*
+ * Misc constants. Please retain them as multiples of 8!
+ * Otherwise 32-64 bit interactions will be messed up :)
+ *
+ * ORANGEFS_MAX_DEBUG_STRING_LEN sizes the kernel_debug_string,
+ * client_debug_string and client_debug_array_string buffers defined in
+ * orangefs-mod.c.
+ */
+#define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000400
+#define ORANGEFS_MAX_DEBUG_ARRAY_LEN 0x00000800
+
+/*
+ * The maximum number of directory entries in a single request is 96.
+ * XXX: Why can this not be higher? The client-side code can handle up to 512.
+ * XXX: What happens if we expect more than the client can return?
+ */
+#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
+
+#include "upcall.h"
+#include "downcall.h"
+
+#endif
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
new file mode 100644
index 000000000000..a9925e296ceb
--- /dev/null
+++ b/fs/orangefs/orangefs-kernel.h
@@ -0,0 +1,623 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * The ORANGEFS Linux kernel support allows ORANGEFS volumes to be mounted and
+ * accessed through the Linux VFS (i.e. using standard I/O system calls).
+ * This support is only needed on clients that wish to mount the file system.
+ *
+ */
+
+/*
+ * Declarations and macros for the ORANGEFS Linux kernel support.
+ */
+
+#ifndef __ORANGEFSKERNEL_H
+#define __ORANGEFSKERNEL_H
+
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/statfs.h>
+#include <linux/backing-dev.h>
+#include <linux/device.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/vmalloc.h>
+
+#include <linux/aio.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/wait.h>
+#include <linux/dcache.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include <linux/exportfs.h>
+
+#include <asm/unaligned.h>
+
+#include "orangefs-dev-proto.h"
+
+/*
+ * Timeouts, in seconds. ORANGEFS_DEFAULT_OP_TIMEOUT_SECS and
+ * ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS are the initial values of the
+ * op_timeout_secs and slot_timeout_secs module parameters defined in
+ * orangefs-mod.c; the op default is shorter when ORANGEFS_KERNEL_DEBUG
+ * is defined.
+ */
+#ifdef ORANGEFS_KERNEL_DEBUG
+#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 10
+#else
+#define ORANGEFS_DEFAULT_OP_TIMEOUT_SECS 20
+#endif
+
+#define ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS 30
+
+#define ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS 900 /* 15 minutes */
+
+#define ORANGEFS_REQDEVICE_NAME "pvfs2-req"
+
+#define ORANGEFS_DEVREQ_MAGIC 0x20030529
+#define ORANGEFS_LINK_MAX 0x000000FF
+#define ORANGEFS_PURGE_RETRY_COUNT 0x00000005
+#define ORANGEFS_MAX_NUM_OPTIONS 0x00000004
+#define ORANGEFS_MAX_MOUNT_OPT_LEN 0x00000080
+#define ORANGEFS_MAX_FSKEY_LEN 64
+
+#define MAX_DEV_REQ_UPSIZE (2 * sizeof(__s32) + \
+sizeof(__u64) + sizeof(struct orangefs_upcall_s))
+#define MAX_DEV_REQ_DOWNSIZE (2 * sizeof(__s32) + \
+sizeof(__u64) + sizeof(struct orangefs_downcall_s))
+
+/*
+ * valid orangefs kernel operation states
+ *
+ * Apart from "unknown" (0), each state is a distinct bit: the
+ * op_state_*() macros test op_state with a bitwise AND, and
+ * set_op_state_purged() ORs OP_VFS_STATE_PURGED into the current state
+ * rather than overwriting it.
+ *
+ * unknown - op was just initialized
+ * waiting - op is on request_list (upward bound)
+ * inprogr - op is in progress (waiting for downcall)
+ * serviced - op has matching downcall; ok
+ * purged - op has to start a timer since client-core
+ * exited uncleanly before servicing op
+ * given up - submitter has given up waiting for it
+ */
+enum orangefs_vfs_op_states {
+ OP_VFS_STATE_UNKNOWN = 0,
+ OP_VFS_STATE_WAITING = 1,
+ OP_VFS_STATE_INPROGR = 2,
+ OP_VFS_STATE_SERVICED = 4,
+ OP_VFS_STATE_PURGED = 8,
+ OP_VFS_STATE_GIVEN_UP = 16,
+};
+
+/*
+ * An array of client_debug_mask will be built to hold debug keyword/mask
+ * values fetched from userspace.
+ *
+ * Each element pairs one keyword string with a 128-bit mask held as two
+ * 64-bit halves. cdm_array in orangefs-mod.c points at the array and
+ * cdm_element_count is declared alongside it.
+ */
+struct client_debug_mask {
+ char *keyword;
+ __u64 mask1;
+ __u64 mask2;
+};
+
+/*
+ * orangefs kernel memory related flags
+ */
+
+#if ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB))
+#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE
+#else
+#define ORANGEFS_CACHE_CREATE_FLAGS 0
+#endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */
+
+/* orangefs xattr and acl related defines */
+#define ORANGEFS_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define ORANGEFS_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define ORANGEFS_XATTR_INDEX_TRUSTED 3
+#define ORANGEFS_XATTR_INDEX_DEFAULT 4
+
+#define ORANGEFS_XATTR_NAME_ACL_ACCESS XATTR_NAME_POSIX_ACL_ACCESS
+#define ORANGEFS_XATTR_NAME_ACL_DEFAULT XATTR_NAME_POSIX_ACL_DEFAULT
+#define ORANGEFS_XATTR_NAME_TRUSTED_PREFIX "trusted."
+#define ORANGEFS_XATTR_NAME_DEFAULT_PREFIX ""
+
+/* these functions are defined in orangefs-utils.c */
+int orangefs_prepare_cdm_array(char *debug_array_string);
+int orangefs_prepare_debugfs_help_string(int);
+
+/* defined in orangefs-debugfs.c */
+int orangefs_client_debug_init(void);
+
+void debug_string_to_mask(char *, void *, int);
+void do_c_mask(int, char *, struct client_debug_mask **);
+void do_k_mask(int, char *, __u64 **);
+
+void debug_mask_to_string(void *, int);
+void do_k_string(void *, int);
+void do_c_string(void *, int);
+int check_amalgam_keyword(void *, int);
+int keyword_is_amalgam(char *);
+
+/* these variables are defined in orangefs-mod.c */
+extern char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+extern char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+extern char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+extern unsigned int kernel_mask_set_mod_init;
+
+extern int orangefs_init_acl(struct inode *inode, struct inode *dir);
+extern const struct xattr_handler *orangefs_xattr_handlers[];
+
+extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type);
+extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+
+/*
+ * Redefine xtvec structure so that we could move helper functions out of
+ * the define
+ */
+struct xtvec {
+ __kernel_off_t xtv_off; /* must be off_t */
+ __kernel_size_t xtv_len; /* must be size_t */
+};
+
+/*
+ * orangefs data structures
+ */
+
+/*
+ * One kernel operation: the request (upcall), its reply (downcall) and
+ * the bookkeeping used to track it.
+ */
+struct orangefs_kernel_op_s {
+ /* current state; bits from enum orangefs_vfs_op_states */
+ enum orangefs_vfs_op_states op_state;
+ __u64 tag;
+
+ /*
+ * Set uses_shared_memory to non zero if this operation uses
+ * shared memory. If true, then a retry on the op must also
+ * get a new shared memory buffer and re-populate it.
+ * Cancels don't care - it only matters for service_operation()
+ * retry logics and cancels don't go through it anymore. It
+ * safely stays non-zero when we use it as slot_to_free.
+ */
+ union {
+ int uses_shared_memory;
+ int slot_to_free;
+ };
+
+ /* upcall.type holds the ORANGEFS_VFS_OP_* type; see op_is_cancel() */
+ struct orangefs_upcall_s upcall;
+ struct orangefs_downcall_s downcall;
+
+ /* completed by set_op_state_serviced() and set_op_state_purged() */
+ struct completion waitq;
+ /* held by set_op_state_purged() while it changes the op */
+ spinlock_t lock;
+
+ int attempts;
+
+ /* linkage used when walking htable_ops_in_progress buckets */
+ struct list_head list;
+};
+
+/*
+ * State setters. These overwrite op_state with a single state value;
+ * set_op_state_serviced() additionally completes the op's waitq.
+ */
+#define set_op_state_waiting(op) ((op)->op_state = OP_VFS_STATE_WAITING)
+#define set_op_state_inprogress(op) ((op)->op_state = OP_VFS_STATE_INPROGR)
+#define set_op_state_given_up(op) ((op)->op_state = OP_VFS_STATE_GIVEN_UP)
+static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
+{
+ op->op_state = OP_VFS_STATE_SERVICED;
+ complete(&op->waitq);
+}
+
+/*
+ * State tests: each evaluates to non-zero when the corresponding bit is
+ * set in op_state. op_is_cancel() instead checks the upcall type.
+ */
+#define op_state_waiting(op) ((op)->op_state & OP_VFS_STATE_WAITING)
+#define op_state_in_progress(op) ((op)->op_state & OP_VFS_STATE_INPROGR)
+#define op_state_serviced(op) ((op)->op_state & OP_VFS_STATE_SERVICED)
+#define op_state_purged(op) ((op)->op_state & OP_VFS_STATE_PURGED)
+#define op_state_given_up(op) ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
+#define op_is_cancel(op) ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL)
+
+void op_release(struct orangefs_kernel_op_s *op);
+
+extern void orangefs_bufmap_put(int);
+/*
+ * Dispose of a cancel op: pass its slot_to_free to orangefs_bufmap_put(),
+ * then hand the op itself to op_release().
+ */
+static inline void put_cancel(struct orangefs_kernel_op_s *op)
+{
+ orangefs_bufmap_put(op->slot_to_free);
+ op_release(op);
+}
+
+/*
+ * Purge an op, taking op->lock for the duration of the state change.
+ *
+ * A cancel op is not marked: it is unlinked from its list and, once the
+ * lock has been dropped, disposed of with put_cancel(). Any other op gets
+ * OP_VFS_STATE_PURGED ORed into op_state (existing bits are kept) and its
+ * waitq is completed before the lock is released.
+ *
+ * NOTE(review): put_cancel() ends in op_release(); if that frees the op,
+ * callers must not touch a cancel op after this returns - confirm.
+ */
+static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
+{
+ spin_lock(&op->lock);
+ if (unlikely(op_is_cancel(op))) {
+ list_del_init(&op->list);
+ spin_unlock(&op->lock);
+ put_cancel(op);
+ } else {
+ op->op_state |= OP_VFS_STATE_PURGED;
+ complete(&op->waitq);
+ spin_unlock(&op->lock);
+ }
+}
+
+/* per inode private orangefs info */
+struct orangefs_inode_s {
+ /* khandle and fs_id; see get_khandle_from_ino(), get_fsid_from_ino() */
+ struct orangefs_object_kref refn;
+ char link_target[ORANGEFS_NAME_MAX];
+ __s64 blksize;
+ /*
+ * Reading/Writing Extended attributes need to acquire the appropriate
+ * reader/writer semaphore on the orangefs_inode_s structure.
+ */
+ struct rw_semaphore xattr_sem;
+
+ /* embedded VFS inode; ORANGEFS_I() maps it back to this structure */
+ struct inode vfs_inode;
+ sector_t last_failed_block_index_read;
+
+ /*
+ * State of in-memory attributes not yet flushed to disk associated
+ * with this object
+ * (bit numbers are the P_*_FLAG values)
+ */
+ unsigned long pinode_flags;
+};
+
+/*
+ * Bit numbers within orangefs_inode_s.pinode_flags, followed by
+ * clear/set/test helpers for each bit built on clear_bit(), set_bit()
+ * and test_bit(). Each helper takes a pointer to the orangefs_inode_s.
+ */
+#define P_ATIME_FLAG 0
+#define P_MTIME_FLAG 1
+#define P_CTIME_FLAG 2
+#define P_MODE_FLAG 3
+
+#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
+#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
+#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags)
+
+#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
+#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
+#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags)
+
+#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
+#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
+#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags)
+
+#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
+#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
+#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags)
+
+/*
+ * per superblock private orangefs info
+ *
+ * Reached from a struct super_block through ORANGEFS_SB().
+ */
+struct orangefs_sb_info_s {
+ /* compared against an inode's khandle by is_root_handle() */
+ struct orangefs_khandle root_khandle;
+ __s32 fs_id;
+ int id;
+ /* ORANGEFS_OPT_* bits; ORANGEFS_OPT_INTR is read by get_interruptible_flag() */
+ int flags;
+#define ORANGEFS_OPT_INTR 0x01
+#define ORANGEFS_OPT_LOCAL_LOCK 0x02
+ char devname[ORANGEFS_MAX_SERVER_ADDR_LEN];
+ struct super_block *sb;
+ int mount_pending;
+ /* NOTE(review): presumably linkage on orangefs_superblocks - confirm */
+ struct list_head list;
+};
+
+/*
+ * structure that holds the state of any async I/O operation issued
+ * through the VFS. Needed especially to handle cancellation requests
+ * or even completion notification so that the VFS client-side daemon
+ * can free up its vfs_request slots.
+ */
+struct orangefs_kiocb_s {
+ /* the pointer to the task that initiated the AIO */
+ struct task_struct *tsk;
+
+ /* pointer to the kiocb that kicked this operation */
+ struct kiocb *kiocb;
+
+ /* buffer index that was used for the I/O */
+ struct orangefs_bufmap *bufmap;
+ int buffer_index;
+
+ /* orangefs kernel operation type */
+ struct orangefs_kernel_op_s *op;
+
+ /* The user space buffers from/to which I/O is being staged */
+ struct iovec *iov;
+
+ /* number of elements in the iovector */
+ unsigned long nr_segs;
+
+ /* set to indicate the type of the operation */
+ int rw;
+
+ /* file offset */
+ loff_t offset;
+
+ /* and the count in bytes */
+ size_t bytes_to_be_copied;
+
+ ssize_t bytes_copied;
+ int needs_cleanup;
+};
+
+struct orangefs_stats {
+ unsigned long cache_hits;
+ unsigned long cache_misses;
+ unsigned long reads;
+ unsigned long writes;
+};
+
+extern struct orangefs_stats g_orangefs_stats;
+
+/*
+ * NOTE: See Documentation/filesystems/porting for information
+ * on implementing FOO_I and properly accessing fs private data
+ *
+ * ORANGEFS_I() maps a VFS inode to the orangefs_inode_s that embeds it
+ * as its vfs_inode member.
+ */
+static inline struct orangefs_inode_s *ORANGEFS_I(struct inode *inode)
+{
+ return container_of(inode, struct orangefs_inode_s, vfs_inode);
+}
+
+/* Return the orangefs private info stored in sb->s_fs_info. */
+static inline struct orangefs_sb_info_s *ORANGEFS_SB(struct super_block *sb)
+{
+ return (struct orangefs_sb_info_s *) sb->s_fs_info;
+}
+
+/*
+ * Fold bytes 0-15 of a khandle into a 64-bit value by XORing byte pairs
+ * and return it as the inode number. The value is built as a __u64 and
+ * converted to ino_t on return.
+ */
+static inline ino_t orangefs_khandle_to_ino(struct orangefs_khandle *khandle)
+{
+	union {
+		unsigned char bytes[8];
+		__u64 ino;
+	} folded;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		/* first four result bytes: u[0..3] ^ u[4..7] */
+		folded.bytes[i] = khandle->u[i] ^ khandle->u[i + 4];
+		/* last four result bytes: u[12..15] ^ u[8..11] */
+		folded.bytes[i + 4] = khandle->u[i + 12] ^ khandle->u[i + 8];
+	}
+
+	return folded.ino;
+}
+
+/* Return a pointer to the khandle stored in the inode's orangefs refn. */
+static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
+{
+ return &(ORANGEFS_I(inode)->refn.khandle);
+}
+
+/* Return the fs_id stored in the inode's orangefs refn. */
+static inline __s32 get_fsid_from_ino(struct inode *inode)
+{
+ return ORANGEFS_I(inode)->refn.fs_id;
+}
+
+/*
+ * Compute the inode number for an inode by folding its khandle with
+ * orangefs_khandle_to_ino().
+ */
+static inline ino_t get_ino_from_khandle(struct inode *inode)
+{
+	return orangefs_khandle_to_ino(get_khandle_from_ino(inode));
+}
+
+/*
+ * Return the khandle-derived inode number of the dentry's parent, i.e. of
+ * dentry->d_parent->d_inode.
+ */
+static inline ino_t get_parent_ino_from_dentry(struct dentry *dentry)
+{
+ return get_ino_from_khandle(dentry->d_parent->d_inode);
+}
+
+/*
+ * Report whether an inode's khandle is the root khandle recorded in its
+ * superblock info.
+ *
+ * Returns 1 when ORANGEFS_khandle_cmp() yields 0 for the two handles,
+ * otherwise 0.
+ */
+static inline int is_root_handle(struct inode *inode)
+{
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s: root handle: %pU, this handle: %pU:\n",
+		     __func__,
+		     &ORANGEFS_SB(inode->i_sb)->root_khandle,
+		     get_khandle_from_ino(inode));
+
+	return !ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode->i_sb)->root_khandle),
+				     get_khandle_from_ino(inode));
+}
+
+/*
+ * Report whether resp_handle is the same khandle as the one stored in
+ * the inode.
+ *
+ * Returns 1 when ORANGEFS_khandle_cmp() yields 0 for the two handles,
+ * otherwise 0.
+ */
+static inline int match_handle(struct orangefs_khandle resp_handle,
+			       struct inode *inode)
+{
+	gossip_debug(GOSSIP_DCACHE_DEBUG,
+		     "%s: one handle: %pU, another handle:%pU:\n",
+		     __func__,
+		     &resp_handle,
+		     get_khandle_from_ino(inode));
+
+	return !ORANGEFS_khandle_cmp(&resp_handle,
+				     get_khandle_from_ino(inode));
+}
+
+/*
+ * defined in orangefs-cache.c
+ */
+int op_cache_initialize(void);
+int op_cache_finalize(void);
+struct orangefs_kernel_op_s *op_alloc(__s32 type);
+void orangefs_new_tag(struct orangefs_kernel_op_s *op);
+char *get_opname_string(struct orangefs_kernel_op_s *new_op);
+
+int orangefs_inode_cache_initialize(void);
+int orangefs_inode_cache_finalize(void);
+
+/*
+ * defined in orangefs-mod.c
+ */
+void purge_inprogress_ops(void);
+
+/*
+ * defined in waitqueue.c
+ */
+void purge_waiting_ops(void);
+
+/*
+ * defined in super.c
+ */
+struct dentry *orangefs_mount(struct file_system_type *fst,
+ int flags,
+ const char *devname,
+ void *data);
+
+void orangefs_kill_sb(struct super_block *sb);
+int orangefs_remount(struct orangefs_sb_info_s *);
+
+int fsid_key_table_initialize(void);
+void fsid_key_table_finalize(void);
+
+/*
+ * defined in inode.c
+ */
+__u32 convert_to_orangefs_mask(unsigned long lite_mask);
+struct inode *orangefs_new_inode(struct super_block *sb,
+ struct inode *dir,
+ int mode,
+ dev_t dev,
+ struct orangefs_object_kref *ref);
+
+int orangefs_setattr(struct dentry *dentry, struct iattr *iattr);
+
+int orangefs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry,
+ struct kstat *kstat);
+
+int orangefs_permission(struct inode *inode, int mask);
+
+/*
+ * defined in xattr.c
+ */
+int orangefs_setxattr(struct dentry *dentry,
+ const char *name,
+ const void *value,
+ size_t size,
+ int flags);
+
+ssize_t orangefs_getxattr(struct dentry *dentry,
+ const char *name,
+ void *buffer,
+ size_t size);
+
+ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+
+/*
+ * defined in namei.c
+ */
+struct inode *orangefs_iget(struct super_block *sb,
+ struct orangefs_object_kref *ref);
+
+ssize_t orangefs_inode_read(struct inode *inode,
+ struct iov_iter *iter,
+ loff_t *offset,
+ loff_t readahead_size);
+
+/*
+ * defined in devorangefs-req.c
+ */
+int orangefs_dev_init(void);
+void orangefs_dev_cleanup(void);
+int is_daemon_in_service(void);
+bool __is_daemon_in_service(void);
+
+/*
+ * defined in orangefs-utils.c
+ */
+__s32 fsid_of_op(struct orangefs_kernel_op_s *op);
+
+int orangefs_flush_inode(struct inode *inode);
+
+ssize_t orangefs_inode_getxattr(struct inode *inode,
+ const char *prefix,
+ const char *name,
+ void *buffer,
+ size_t size);
+
+int orangefs_inode_setxattr(struct inode *inode,
+ const char *prefix,
+ const char *name,
+ const void *value,
+ size_t size,
+ int flags);
+
+int orangefs_inode_getattr(struct inode *inode, int new, int size);
+
+int orangefs_inode_check_changed(struct inode *inode);
+
+int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr);
+
+void orangefs_make_bad_inode(struct inode *inode);
+
+int orangefs_unmount_sb(struct super_block *sb);
+
+bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op);
+
+int orangefs_normalize_to_errno(__s32 error_code);
+
+extern struct mutex devreq_mutex;
+extern struct mutex request_mutex;
+extern int debug;
+extern int op_timeout_secs;
+extern int slot_timeout_secs;
+extern struct list_head orangefs_superblocks;
+extern spinlock_t orangefs_superblocks_lock;
+extern struct list_head orangefs_request_list;
+extern spinlock_t orangefs_request_list_lock;
+extern wait_queue_head_t orangefs_request_list_waitq;
+extern struct list_head *htable_ops_in_progress;
+extern spinlock_t htable_ops_in_progress_lock;
+extern int hash_table_size;
+
+extern const struct address_space_operations orangefs_address_operations;
+extern struct backing_dev_info orangefs_backing_dev_info;
+extern struct inode_operations orangefs_file_inode_operations;
+extern const struct file_operations orangefs_file_operations;
+extern struct inode_operations orangefs_symlink_inode_operations;
+extern struct inode_operations orangefs_dir_inode_operations;
+extern const struct file_operations orangefs_dir_operations;
+extern const struct dentry_operations orangefs_dentry_operations;
+extern const struct file_operations orangefs_devreq_file_operations;
+
+extern wait_queue_head_t orangefs_bufmap_init_waitq;
+
+/*
+ * misc convenience macros
+ */
+
+#define ORANGEFS_OP_INTERRUPTIBLE 1 /* service_operation() is interruptible */
+#define ORANGEFS_OP_PRIORITY 2 /* service_operation() is high priority */
+#define ORANGEFS_OP_CANCELLATION 4 /* this is a cancellation */
+#define ORANGEFS_OP_NO_MUTEX 8 /* don't acquire request_mutex */
+#define ORANGEFS_OP_ASYNC 16 /* Queue it, but don't wait */
+
+int service_operation(struct orangefs_kernel_op_s *op,
+ const char *op_name,
+ int flags);
+
+/*
+ * Translate the ORANGEFS_OPT_INTR bit in the inode's superblock info into
+ * a service_operation() flag: ORANGEFS_OP_INTERRUPTIBLE when the bit is
+ * set, 0 otherwise. The argument is parenthesized so that any pointer
+ * expression can be passed.
+ */
+#define get_interruptible_flag(inode) \
+	((ORANGEFS_SB((inode)->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
+		ORANGEFS_OP_INTERRUPTIBLE : 0)
+
+/*
+ * Fill a sys_attr structure (passed as an lvalue, not a pointer) with
+ * defaults: owner/group from the current task's fsuid/fsgid mapped through
+ * the current user namespace, perms translated from mode, all three times
+ * zeroed, and mask set to ORANGEFS_ATTR_SYS_ALL_SETABLE.
+ *
+ * The type argument is accepted but not used. sys_attr is parenthesized
+ * so that expressions such as *ptr expand correctly.
+ */
+#define fill_default_sys_attrs(sys_attr, type, mode) \
+do { \
+	(sys_attr).owner = from_kuid(current_user_ns(), current_fsuid()); \
+	(sys_attr).group = from_kgid(current_user_ns(), current_fsgid()); \
+	(sys_attr).perms = ORANGEFS_util_translate_mode(mode); \
+	(sys_attr).mtime = 0; \
+	(sys_attr).atime = 0; \
+	(sys_attr).ctime = 0; \
+	(sys_attr).mask = ORANGEFS_ATTR_SYS_ALL_SETABLE; \
+} while (0)
+
+/*
+ * Wrapper around i_size_write() that, on 32-bit SMP builds only, holds
+ * inode->i_mutex across the call. On every other configuration it is a
+ * plain i_size_write().
+ *
+ * NOTE(review): presumably i_size_write() needs its callers serialized on
+ * 32-bit SMP - confirm against the i_size_write() definition.
+ */
+static inline void orangefs_i_size_write(struct inode *inode, loff_t i_size)
+{
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ mutex_lock(&inode->i_mutex);
+#endif
+ i_size_write(inode, i_size);
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+ mutex_unlock(&inode->i_mutex);
+#endif
+}
+
+#endif /* __ORANGEFSKERNEL_H */
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
new file mode 100644
index 000000000000..6f072a8c0de1
--- /dev/null
+++ b/fs/orangefs/orangefs-mod.c
@@ -0,0 +1,293 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * Changes by Acxiom Corporation to add proc file handler for pvfs2 client
+ * parameters, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-debugfs.h"
+#include "orangefs-sysfs.h"
+
+/* ORANGEFS_VERSION is a ./configure define */
+#ifndef ORANGEFS_VERSION
+#define ORANGEFS_VERSION "upstream"
+#endif
+
+/*
+ * global variables declared here
+ */
+
+/* array of client debug keyword/mask values */
+struct client_debug_mask *cdm_array;
+int cdm_element_count;
+
+char kernel_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN] = "none";
+char client_debug_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+char client_debug_array_string[ORANGEFS_MAX_DEBUG_STRING_LEN];
+
+char *debug_help_string;
+int help_string_initialized;
+struct dentry *help_file_dentry;
+struct dentry *client_debug_dentry;
+struct dentry *debug_dir;
+int client_verbose_index;
+int client_all_index;
+struct orangefs_stats g_orangefs_stats;
+
+/* the size of the hash tables for ops in progress */
+int hash_table_size = 509;
+
+static ulong module_parm_debug_mask;
+__u64 gossip_debug_mask;
+struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
+unsigned int kernel_mask_set_mod_init; /* implicitly false */
+int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
+int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("ORANGEFS Development Team");
+MODULE_DESCRIPTION("The Linux Kernel VFS interface to ORANGEFS");
+MODULE_PARM_DESC(module_parm_debug_mask, "debugging level (see orangefs-debug.h for values)");
+MODULE_PARM_DESC(op_timeout_secs, "Operation timeout in seconds");
+MODULE_PARM_DESC(slot_timeout_secs, "Slot timeout in seconds");
+MODULE_PARM_DESC(hash_table_size,
+ "size of hash table for operations in progress");
+
+static struct file_system_type orangefs_fs_type = {
+ .name = "pvfs2",
+ .mount = orangefs_mount,
+ .kill_sb = orangefs_kill_sb,
+ .owner = THIS_MODULE,
+};
+
+module_param(hash_table_size, int, 0);
+module_param(module_parm_debug_mask, ulong, 0644);
+module_param(op_timeout_secs, int, 0);
+module_param(slot_timeout_secs, int, 0);
+
+/* synchronizes the request device file */
+DEFINE_MUTEX(devreq_mutex);
+
+/*
+ * Blocks non-priority requests from being queued for servicing. This
+ * could be used for protecting the request list data structure, but
+ * for now it's only being used to stall the op addition to the request
+ * list
+ */
+DEFINE_MUTEX(request_mutex);
+
+/* hash table for storing operations waiting for matching downcall */
+struct list_head *htable_ops_in_progress;
+DEFINE_SPINLOCK(htable_ops_in_progress_lock);
+
+/* list for queueing upcall operations */
+LIST_HEAD(orangefs_request_list);
+
+/* used to protect the above orangefs_request_list */
+DEFINE_SPINLOCK(orangefs_request_list_lock);
+
+/* used for incoming request notification */
+DECLARE_WAIT_QUEUE_HEAD(orangefs_request_list_waitq);
+
+/*
+ * Module entry point.
+ *
+ * Derives the kernel debug mask and debug string from the
+ * module_parm_debug_mask parameter, clamps negative timeouts to 0, then
+ * brings up each piece in turn: backing_dev_info, op cache, inode cache,
+ * the in-progress op hash table, the fsid key table, the debugfs help
+ * string and files, sysfs, the request device, and finally the
+ * filesystem type registration.
+ *
+ * Returns 0 on success. On failure the error code of the failing step is
+ * returned after unwinding through the labels at the bottom, which undo
+ * the earlier steps in reverse order.
+ */
+static int __init orangefs_init(void)
+{
+	int ret = -1;
+	int i;	/* signed, to match hash_table_size */
+
+	/* convert input debug mask to a 64-bit unsigned integer */
+	gossip_debug_mask = (unsigned long long) module_parm_debug_mask;
+
+	/*
+	 * set the kernel's gossip debug string; invalid mask values will
+	 * be ignored.
+	 */
+	debug_mask_to_string(&gossip_debug_mask, 0);
+
+	/* remove any invalid values from the mask */
+	debug_string_to_mask(kernel_debug_string, &gossip_debug_mask, 0);
+
+	/*
+	 * if the mask has a non-zero value, then indicate that the mask
+	 * was set when the kernel module was loaded. The orangefs dev ioctl
+	 * command will look at this boolean to determine if the kernel's
+	 * debug mask should be overwritten when the client-core is started.
+	 */
+	if (gossip_debug_mask != 0)
+		kernel_mask_set_mod_init = true;
+
+	pr_info("%s: called with debug mask: :%s: :%llx:\n",
+		__func__,
+		kernel_debug_string,
+		(unsigned long long)gossip_debug_mask);
+
+	ret = bdi_init(&orangefs_backing_dev_info);
+
+	if (ret)
+		return ret;
+
+	if (op_timeout_secs < 0)
+		op_timeout_secs = 0;
+
+	if (slot_timeout_secs < 0)
+		slot_timeout_secs = 0;
+
+	/* initialize global book keeping data structures */
+	ret = op_cache_initialize();
+	if (ret < 0)
+		goto err;
+
+	ret = orangefs_inode_cache_initialize();
+	if (ret < 0)
+		goto cleanup_op;
+
+	htable_ops_in_progress =
+	    kcalloc(hash_table_size, sizeof(struct list_head), GFP_KERNEL);
+	if (!htable_ops_in_progress) {
+		gossip_err("Failed to initialize op hashtable\n");
+		ret = -ENOMEM;
+		goto cleanup_inode;
+	}
+
+	/* initialize a doubly linked list at each hash table index */
+	for (i = 0; i < hash_table_size; i++)
+		INIT_LIST_HEAD(&htable_ops_in_progress[i]);
+
+	ret = fsid_key_table_initialize();
+	if (ret < 0)
+		goto cleanup_progress_table;
+
+	/*
+	 * Build the contents of /sys/kernel/debug/orangefs/debug-help
+	 * from the keywords in the kernel keyword/mask array.
+	 *
+	 * The keywords in the client keyword/mask array are
+	 * unknown at boot time.
+	 *
+	 * orangefs_prepare_debugfs_help_string will be used again
+	 * later to rebuild the debug-help file after the client starts
+	 * and passes along the needed info. The argument signifies
+	 * which time orangefs_prepare_debugfs_help_string is being
+	 * called.
+	 */
+	ret = orangefs_prepare_debugfs_help_string(1);
+	if (ret)
+		goto cleanup_key_table;
+
+	ret = orangefs_debugfs_init();
+	if (ret)
+		goto debugfs_init_failed;
+
+	ret = orangefs_kernel_debug_init();
+	if (ret)
+		goto kernel_debug_init_failed;
+
+	ret = orangefs_sysfs_init();
+	if (ret)
+		goto sysfs_init_failed;
+
+	/* Initialize the orangefsdev subsystem. */
+	ret = orangefs_dev_init();
+	if (ret < 0) {
+		gossip_err("%s: could not initialize device subsystem %d!\n",
+			   __func__,
+			   ret);
+		goto cleanup_device;
+	}
+
+	ret = register_filesystem(&orangefs_fs_type);
+	if (ret == 0) {
+		pr_info("orangefs: module version %s loaded\n", ORANGEFS_VERSION);
+		goto out;
+	}
+
+	/*
+	 * Registration failed: the device was initialized, so undo it, then
+	 * fall through the rest of the unwind chain.
+	 */
+	orangefs_dev_cleanup();
+
+cleanup_device:
+	/*
+	 * Reached directly when orangefs_dev_init() failed: the device was
+	 * never set up, but sysfs was, so sysfs is what must be undone here.
+	 */
+	orangefs_sysfs_exit();
+
+sysfs_init_failed:
+
+kernel_debug_init_failed:
+
+debugfs_init_failed:
+	orangefs_debugfs_cleanup();
+
+cleanup_key_table:
+	fsid_key_table_finalize();
+
+cleanup_progress_table:
+	kfree(htable_ops_in_progress);
+
+cleanup_inode:
+	orangefs_inode_cache_finalize();
+
+cleanup_op:
+	op_cache_finalize();
+
+err:
+	bdi_destroy(&orangefs_backing_dev_info);
+
+out:
+	return ret;
+}
+
+/*
+ * Module exit point: unregister the filesystem type, tear down debugfs,
+ * sysfs, the fsid key table and the request device, then finalize the
+ * inode and op caches, free the in-progress hash table and destroy the
+ * backing_dev_info.
+ *
+ * BUG()s if an op is still queued on orangefs_request_list or in any
+ * htable_ops_in_progress bucket once the device has been cleaned up.
+ */
+static void __exit orangefs_exit(void)
+{
+ int i = 0;
+ gossip_debug(GOSSIP_INIT_DEBUG, "orangefs: orangefs_exit called\n");
+
+ unregister_filesystem(&orangefs_fs_type);
+ orangefs_debugfs_cleanup();
+ orangefs_sysfs_exit();
+ fsid_key_table_finalize();
+ orangefs_dev_cleanup();
+ BUG_ON(!list_empty(&orangefs_request_list));
+ for (i = 0; i < hash_table_size; i++)
+ BUG_ON(!list_empty(&htable_ops_in_progress[i]));
+
+ orangefs_inode_cache_finalize();
+ op_cache_finalize();
+
+ kfree(htable_ops_in_progress);
+
+ bdi_destroy(&orangefs_backing_dev_info);
+
+ pr_info("orangefs: module version %s unloaded\n", ORANGEFS_VERSION);
+}
+
+/*
+ * What we do in this function is to walk the list of operations
+ * that are in progress in the hash table and mark them as purged as well.
+ *
+ * Every bucket of htable_ops_in_progress is walked with
+ * htable_ops_in_progress_lock held. The _safe iterator is used because
+ * set_op_state_purged() unlinks cancel ops from the list.
+ *
+ * NOTE(review): for a cancel op, set_op_state_purged() ends in
+ * put_cancel() -> op_release(). If op_release() frees the op, the
+ * gossip_debug() call below reads op after it was released - confirm.
+ */
+void purge_inprogress_ops(void)
+{
+ int i;
+
+ for (i = 0; i < hash_table_size; i++) {
+ struct orangefs_kernel_op_s *op;
+ struct orangefs_kernel_op_s *next;
+
+ spin_lock(&htable_ops_in_progress_lock);
+ list_for_each_entry_safe(op,
+ next,
+ &htable_ops_in_progress[i],
+ list) {
+ set_op_state_purged(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ }
+ spin_unlock(&htable_ops_in_progress_lock);
+ }
+}
+
+module_init(orangefs_init);
+module_exit(orangefs_exit);
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
new file mode 100644
index 000000000000..5c03113e3ad2
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -0,0 +1,1772 @@
+/*
+ * Documentation/ABI/stable/orangefs-sysfs:
+ *
+ * What: /sys/fs/orangefs/perf_counter_reset
+ * Date: June 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * echo a 0 or a 1 into perf_counter_reset to
+ * reset all the counters in
+ * /sys/fs/orangefs/perf_counters
+ * except ones with PINT_PERF_PRESERVE set.
+ *
+ *
+ * What: /sys/fs/orangefs/perf_counters/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Counters and settings for various caches.
+ * Read only.
+ *
+ *
+ * What: /sys/fs/orangefs/perf_time_interval_secs
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Length of perf counter intervals in
+ * seconds.
+ *
+ *
+ * What: /sys/fs/orangefs/perf_history_size
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * The perf_counters cache statistics have N, or
+ * perf_history_size, samples. The default is
+ * one.
+ *
+ * Every perf_time_interval_secs the (first)
+ * samples are reset.
+ *
+ * If N is greater than one, the "current" set
+ * of samples is reset, and the samples from the
+ * other N-1 intervals remain available.
+ *
+ *
+ * What: /sys/fs/orangefs/op_timeout_secs
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Service operation timeout in seconds.
+ *
+ *
+ * What: /sys/fs/orangefs/slot_timeout_secs
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * "Slot" timeout in seconds. A "slot"
+ * is an indexed buffer in the shared
+ * memory segment used for communication
+ * between the kernel module and userspace.
+ * Slots are requested and waited for,
+ * the wait times out after slot_timeout_secs.
+ *
+ *
+ * What: /sys/fs/orangefs/acache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Attribute cache configurable settings.
+ *
+ *
+ * What: /sys/fs/orangefs/ncache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Name cache configurable settings.
+ *
+ *
+ * What: /sys/fs/orangefs/capcache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Capability cache configurable settings.
+ *
+ *
+ * What: /sys/fs/orangefs/ccache/...
+ * Date: Jun 2015
+ * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Description:
+ * Credential cache configurable settings.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-sysfs.h"
+
+#define ORANGEFS_KOBJ_ID "orangefs"
+#define ACACHE_KOBJ_ID "acache"
+#define CAPCACHE_KOBJ_ID "capcache"
+#define CCACHE_KOBJ_ID "ccache"
+#define NCACHE_KOBJ_ID "ncache"
+#define PC_KOBJ_ID "pc"
+#define STATS_KOBJ_ID "stats"
+
+/*
+ * Object wrapping the kobject that orangefs_attr_show() and
+ * orangefs_attr_store() receive; freed by orangefs_release().
+ *
+ * NOTE(review): the int members are not read or written by the handlers
+ * visible in this part of the file (those use globals or service
+ * operations) - confirm whether they are used elsewhere.
+ */
+struct orangefs_obj {
+ struct kobject kobj;
+ int op_timeout_secs;
+ int perf_counter_reset;
+ int perf_history_size;
+ int perf_time_interval_secs;
+ int slot_timeout_secs;
+};
+
+/*
+ * Object wrapping the kobject that acache_orangefs_attr_show() and
+ * acache_orangefs_attr_store() receive; freed by acache_orangefs_release().
+ * Presumably backs /sys/fs/orangefs/acache - registration is not visible
+ * in this part of the file.
+ */
+struct acache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_msecs;
+};
+
+/*
+ * Object wrapping the kobject that capcache_orangefs_attr_show() and
+ * capcache_orangefs_attr_store() receive; freed by
+ * capcache_orangefs_release(). Presumably backs
+ * /sys/fs/orangefs/capcache - registration is not visible here.
+ */
+struct capcache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_secs;
+};
+
+/*
+ * Object wrapping the kobject that ccache_orangefs_attr_show() and
+ * ccache_orangefs_attr_store() receive; freed by ccache_orangefs_release().
+ * Presumably backs /sys/fs/orangefs/ccache - registration is not visible
+ * here.
+ */
+struct ccache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_secs;
+};
+
+/*
+ * Object wrapping the kobject that ncache_orangefs_attr_show() and
+ * ncache_orangefs_attr_store() receive; freed by ncache_orangefs_release().
+ * Presumably backs /sys/fs/orangefs/ncache - registration is not visible
+ * here.
+ */
+struct ncache_orangefs_obj {
+ struct kobject kobj;
+ int hard_limit;
+ int reclaim_percentage;
+ int soft_limit;
+ int timeout_msecs;
+};
+
+/*
+ * Object wrapping the kobject that pc_orangefs_attr_show() receives;
+ * freed by pc_orangefs_release(). There is no store path for this
+ * object: pc_orangefs_sysfs_ops only sets .show.
+ */
+struct pc_orangefs_obj {
+ struct kobject kobj;
+ char *acache;
+ char *capcache;
+ char *ncache;
+};
+
+/*
+ * Object wrapping the kobject that stats_orangefs_attr_show() receives;
+ * freed by stats_orangefs_release(). The values reported for "reads" and
+ * "writes" by sysfs_int_show() come from g_orangefs_stats, not from these
+ * members.
+ */
+struct stats_orangefs_obj {
+ struct kobject kobj;
+ int reads;
+ int writes;
+};
+
+/*
+ * Attribute with typed callbacks for struct orangefs_obj.
+ * orangefs_attr_show()/orangefs_attr_store() recover it from the generic
+ * struct attribute with container_of() and call show/store; a NULL
+ * callback makes them return -EIO.
+ */
+struct orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct orangefs_obj *orangefs_obj,
+ struct orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+/*
+ * Attribute with typed callbacks for struct acache_orangefs_obj.
+ * acache_orangefs_attr_show()/acache_orangefs_attr_store() recover it via
+ * container_of() and call show/store; a NULL callback yields -EIO.
+ */
+struct acache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct acache_orangefs_obj *acache_orangefs_obj,
+ struct acache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct acache_orangefs_obj *acache_orangefs_obj,
+ struct acache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+/*
+ * Attribute with typed callbacks for struct capcache_orangefs_obj.
+ * capcache_orangefs_attr_show()/capcache_orangefs_attr_store() recover it
+ * via container_of() and call show/store; a NULL callback yields -EIO.
+ */
+struct capcache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct capcache_orangefs_obj *capcache_orangefs_obj,
+ struct capcache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct capcache_orangefs_obj *capcache_orangefs_obj,
+ struct capcache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+/*
+ * Attribute with typed callbacks for struct ccache_orangefs_obj.
+ * ccache_orangefs_attr_show()/ccache_orangefs_attr_store() recover it via
+ * container_of() and call show/store; a NULL callback yields -EIO.
+ */
+struct ccache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ccache_orangefs_obj *ccache_orangefs_obj,
+ struct ccache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct ccache_orangefs_obj *ccache_orangefs_obj,
+ struct ccache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+/*
+ * Attribute with typed callbacks for struct ncache_orangefs_obj.
+ * ncache_orangefs_attr_show()/ncache_orangefs_attr_store() recover it via
+ * container_of() and call show/store; a NULL callback yields -EIO.
+ */
+struct ncache_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ncache_orangefs_obj *ncache_orangefs_obj,
+ struct ncache_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct ncache_orangefs_obj *ncache_orangefs_obj,
+ struct ncache_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+/*
+ * Attribute with typed callbacks for struct pc_orangefs_obj.
+ * pc_orangefs_attr_show() recovers it via container_of() and calls show.
+ * The store member is declared, but pc_orangefs_sysfs_ops provides no
+ * .store dispatcher that would call it.
+ */
+struct pc_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct pc_orangefs_obj *pc_orangefs_obj,
+ struct pc_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct pc_orangefs_obj *pc_orangefs_obj,
+ struct pc_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+/*
+ * Attribute with typed callbacks for struct stats_orangefs_obj.
+ * stats_orangefs_attr_show() recovers it via container_of() and calls
+ * show. The store member is declared, but stats_orangefs_sysfs_ops
+ * provides no .store dispatcher that would call it.
+ */
+struct stats_orangefs_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct stats_orangefs_obj *stats_orangefs_obj,
+ struct stats_orangefs_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct stats_orangefs_obj *stats_orangefs_obj,
+ struct stats_orangefs_attribute *attr,
+ const char *buf,
+ size_t count);
+};
+
+/*
+ * sysfs ->show dispatcher for struct orangefs_obj.
+ *
+ * Maps the generic kobject/attribute pair back to the orangefs types and
+ * forwards to the attribute's own show callback. Returns -EIO when the
+ * attribute has no show callback.
+ */
+static ssize_t orangefs_attr_show(struct kobject *kobj,
+				  struct attribute *attr,
+				  char *buf)
+{
+	struct orangefs_attribute *oattr =
+		container_of(attr, struct orangefs_attribute, attr);
+	struct orangefs_obj *obj =
+		container_of(kobj, struct orangefs_obj, kobj);
+	int rc = -EIO;
+
+	if (oattr->show)
+		rc = oattr->show(obj, oattr, buf);
+
+	return rc;
+}
+
+/*
+ * sysfs ->store dispatcher for struct orangefs_obj.
+ *
+ * Maps the generic kobject/attribute pair back to the orangefs types and
+ * forwards buf/len to the attribute's own store callback. Returns -EIO
+ * when the attribute has no store callback.
+ */
+static ssize_t orangefs_attr_store(struct kobject *kobj,
+				   struct attribute *attr,
+				   const char *buf,
+				   size_t len)
+{
+	struct orangefs_attribute *oattr;
+	struct orangefs_obj *obj;
+	int rc = -EIO;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "orangefs_attr_store: start\n");
+
+	oattr = container_of(attr, struct orangefs_attribute, attr);
+	obj = container_of(kobj, struct orangefs_obj, kobj);
+
+	if (oattr->store)
+		rc = oattr->store(obj, oattr, buf, len);
+
+	return rc;
+}
+
+/* sysfs entry points for orangefs_obj attributes: readable and writable. */
+static const struct sysfs_ops orangefs_sysfs_ops = {
+ .show = orangefs_attr_show,
+ .store = orangefs_attr_store,
+};
+
+/*
+ * sysfs ->show dispatcher for struct acache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and calls
+ * the attribute's show callback; returns -EIO if there is none.
+ */
+static ssize_t acache_orangefs_attr_show(struct kobject *kobj,
+					 struct attribute *attr,
+					 char *buf)
+{
+	struct acache_orangefs_attribute *aattr =
+		container_of(attr, struct acache_orangefs_attribute, attr);
+	struct acache_orangefs_obj *obj =
+		container_of(kobj, struct acache_orangefs_obj, kobj);
+	int rc = -EIO;
+
+	if (aattr->show)
+		rc = aattr->show(obj, aattr, buf);
+
+	return rc;
+}
+
+/*
+ * sysfs ->store dispatcher for struct acache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and passes
+ * buf/len to the attribute's store callback; returns -EIO if there is
+ * none.
+ */
+static ssize_t acache_orangefs_attr_store(struct kobject *kobj,
+					  struct attribute *attr,
+					  const char *buf,
+					  size_t len)
+{
+	struct acache_orangefs_attribute *aattr;
+	struct acache_orangefs_obj *obj;
+	int rc = -EIO;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "acache_orangefs_attr_store: start\n");
+
+	aattr = container_of(attr, struct acache_orangefs_attribute, attr);
+	obj = container_of(kobj, struct acache_orangefs_obj, kobj);
+
+	if (aattr->store)
+		rc = aattr->store(obj, aattr, buf, len);
+
+	return rc;
+}
+
+/* sysfs entry points for acache_orangefs_obj attributes: read and write. */
+static const struct sysfs_ops acache_orangefs_sysfs_ops = {
+ .show = acache_orangefs_attr_show,
+ .store = acache_orangefs_attr_store,
+};
+
+/*
+ * sysfs ->show dispatcher for struct capcache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and calls
+ * the attribute's show callback; returns -EIO if there is none.
+ */
+static ssize_t capcache_orangefs_attr_show(struct kobject *kobj,
+					   struct attribute *attr,
+					   char *buf)
+{
+	struct capcache_orangefs_attribute *cattr =
+		container_of(attr, struct capcache_orangefs_attribute, attr);
+	struct capcache_orangefs_obj *obj =
+		container_of(kobj, struct capcache_orangefs_obj, kobj);
+	int rc = -EIO;
+
+	if (cattr->show)
+		rc = cattr->show(obj, cattr, buf);
+
+	return rc;
+}
+
+/*
+ * sysfs ->store dispatcher for struct capcache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and passes
+ * buf/len to the attribute's store callback; returns -EIO if there is
+ * none.
+ */
+static ssize_t capcache_orangefs_attr_store(struct kobject *kobj,
+					    struct attribute *attr,
+					    const char *buf,
+					    size_t len)
+{
+	struct capcache_orangefs_attribute *cattr;
+	struct capcache_orangefs_obj *obj;
+	int rc = -EIO;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "capcache_orangefs_attr_store: start\n");
+
+	cattr = container_of(attr, struct capcache_orangefs_attribute, attr);
+	obj = container_of(kobj, struct capcache_orangefs_obj, kobj);
+
+	if (cattr->store)
+		rc = cattr->store(obj, cattr, buf, len);
+
+	return rc;
+}
+
+/* sysfs entry points for capcache_orangefs_obj attributes: read and write. */
+static const struct sysfs_ops capcache_orangefs_sysfs_ops = {
+ .show = capcache_orangefs_attr_show,
+ .store = capcache_orangefs_attr_store,
+};
+
+/*
+ * sysfs ->show dispatcher for struct ccache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and calls
+ * the attribute's show callback; returns -EIO if there is none.
+ */
+static ssize_t ccache_orangefs_attr_show(struct kobject *kobj,
+					 struct attribute *attr,
+					 char *buf)
+{
+	struct ccache_orangefs_attribute *cattr =
+		container_of(attr, struct ccache_orangefs_attribute, attr);
+	struct ccache_orangefs_obj *obj =
+		container_of(kobj, struct ccache_orangefs_obj, kobj);
+	int rc = -EIO;
+
+	if (cattr->show)
+		rc = cattr->show(obj, cattr, buf);
+
+	return rc;
+}
+
+/*
+ * sysfs ->store dispatcher for struct ccache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and passes
+ * buf/len to the attribute's store callback; returns -EIO if there is
+ * none.
+ */
+static ssize_t ccache_orangefs_attr_store(struct kobject *kobj,
+					  struct attribute *attr,
+					  const char *buf,
+					  size_t len)
+{
+	struct ccache_orangefs_attribute *cattr;
+	struct ccache_orangefs_obj *obj;
+	int rc = -EIO;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "ccache_orangefs_attr_store: start\n");
+
+	cattr = container_of(attr, struct ccache_orangefs_attribute, attr);
+	obj = container_of(kobj, struct ccache_orangefs_obj, kobj);
+
+	if (cattr->store)
+		rc = cattr->store(obj, cattr, buf, len);
+
+	return rc;
+}
+
+/* sysfs entry points for ccache_orangefs_obj attributes: read and write. */
+static const struct sysfs_ops ccache_orangefs_sysfs_ops = {
+ .show = ccache_orangefs_attr_show,
+ .store = ccache_orangefs_attr_store,
+};
+
+/*
+ * sysfs ->show dispatcher for struct ncache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and calls
+ * the attribute's show callback; returns -EIO if there is none.
+ */
+static ssize_t ncache_orangefs_attr_show(struct kobject *kobj,
+					 struct attribute *attr,
+					 char *buf)
+{
+	struct ncache_orangefs_attribute *nattr =
+		container_of(attr, struct ncache_orangefs_attribute, attr);
+	struct ncache_orangefs_obj *obj =
+		container_of(kobj, struct ncache_orangefs_obj, kobj);
+	int rc = -EIO;
+
+	if (nattr->show)
+		rc = nattr->show(obj, nattr, buf);
+
+	return rc;
+}
+
+/*
+ * sysfs ->store dispatcher for struct ncache_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and passes
+ * buf/len to the attribute's store callback; returns -EIO if there is
+ * none.
+ */
+static ssize_t ncache_orangefs_attr_store(struct kobject *kobj,
+					  struct attribute *attr,
+					  const char *buf,
+					  size_t len)
+{
+	struct ncache_orangefs_attribute *nattr;
+	struct ncache_orangefs_obj *obj;
+	int rc = -EIO;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "ncache_orangefs_attr_store: start\n");
+
+	nattr = container_of(attr, struct ncache_orangefs_attribute, attr);
+	obj = container_of(kobj, struct ncache_orangefs_obj, kobj);
+
+	if (nattr->store)
+		rc = nattr->store(obj, nattr, buf, len);
+
+	return rc;
+}
+
+/* sysfs entry points for ncache_orangefs_obj attributes: read and write. */
+static const struct sysfs_ops ncache_orangefs_sysfs_ops = {
+ .show = ncache_orangefs_attr_show,
+ .store = ncache_orangefs_attr_store,
+};
+
+/*
+ * sysfs ->show dispatcher for struct pc_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and calls
+ * the attribute's show callback; returns -EIO if there is none.
+ */
+static ssize_t pc_orangefs_attr_show(struct kobject *kobj,
+				     struct attribute *attr,
+				     char *buf)
+{
+	struct pc_orangefs_attribute *pattr =
+		container_of(attr, struct pc_orangefs_attribute, attr);
+	struct pc_orangefs_obj *obj =
+		container_of(kobj, struct pc_orangefs_obj, kobj);
+	int rc = -EIO;
+
+	if (pattr->show)
+		rc = pattr->show(obj, pattr, buf);
+
+	return rc;
+}
+
+/* sysfs entry points for pc_orangefs_obj attributes: show only, no store. */
+static const struct sysfs_ops pc_orangefs_sysfs_ops = {
+ .show = pc_orangefs_attr_show,
+};
+
+/*
+ * sysfs ->show dispatcher for struct stats_orangefs_obj.
+ *
+ * Recovers the typed attribute and object with container_of() and calls
+ * the attribute's show callback; returns -EIO if there is none.
+ */
+static ssize_t stats_orangefs_attr_show(struct kobject *kobj,
+					struct attribute *attr,
+					char *buf)
+{
+	struct stats_orangefs_attribute *sattr =
+		container_of(attr, struct stats_orangefs_attribute, attr);
+	struct stats_orangefs_obj *obj =
+		container_of(kobj, struct stats_orangefs_obj, kobj);
+	int rc = -EIO;
+
+	if (sattr->show)
+		rc = sattr->show(obj, sattr, buf);
+
+	return rc;
+}
+
+/* sysfs entry points for stats_orangefs_obj attributes: show only, no store. */
+static const struct sysfs_ops stats_orangefs_sysfs_ops = {
+ .show = stats_orangefs_attr_show,
+};
+
+/* Free the orangefs_obj that embeds @kobj. */
+static void orangefs_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct orangefs_obj, kobj));
+}
+
+/* Free the acache_orangefs_obj that embeds @kobj. */
+static void acache_orangefs_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct acache_orangefs_obj, kobj));
+}
+
+/* Free the capcache_orangefs_obj that embeds @kobj. */
+static void capcache_orangefs_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct capcache_orangefs_obj, kobj));
+}
+
+/* Free the ccache_orangefs_obj that embeds @kobj. */
+static void ccache_orangefs_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct ccache_orangefs_obj, kobj));
+}
+
+/* Free the ncache_orangefs_obj that embeds @kobj. */
+static void ncache_orangefs_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct ncache_orangefs_obj, kobj));
+}
+
+/* Free the pc_orangefs_obj that embeds @kobj. */
+static void pc_orangefs_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct pc_orangefs_obj, kobj));
+}
+
+/* Free the stats_orangefs_obj that embeds @kobj. */
+static void stats_orangefs_release(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct stats_orangefs_obj, kobj));
+}
+
+/*
+ * Format an in-kernel integer value into @buf.
+ *
+ * @kobj_id selects how @attr is interpreted:
+ *   ORANGEFS_KOBJ_ID - struct orangefs_attribute; "op_timeout_secs" and
+ *                      "slot_timeout_secs" print the globals of the same
+ *                      name.
+ *   STATS_KOBJ_ID    - struct stats_orangefs_attribute; "reads" and
+ *                      "writes" print the g_orangefs_stats counters.
+ *
+ * Returns the scnprintf() result, or -EIO when the id or the attribute
+ * name is not one of the above.
+ */
+static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
+{
+	const char *name;
+	int rc = -EIO;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG, "sysfs_int_show: id:%s:\n", kobj_id);
+
+	if (strcmp(kobj_id, ORANGEFS_KOBJ_ID) == 0) {
+		name = ((struct orangefs_attribute *)attr)->attr.name;
+
+		if (strcmp(name, "op_timeout_secs") == 0)
+			rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+				       op_timeout_secs);
+		else if (strcmp(name, "slot_timeout_secs") == 0)
+			rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+				       slot_timeout_secs);
+	} else if (strcmp(kobj_id, STATS_KOBJ_ID) == 0) {
+		name = ((struct stats_orangefs_attribute *)attr)->attr.name;
+
+		if (strcmp(name, "reads") == 0)
+			rc = scnprintf(buf, PAGE_SIZE, "%lu\n",
+				       g_orangefs_stats.reads);
+		else if (strcmp(name, "writes") == 0)
+			rc = scnprintf(buf, PAGE_SIZE, "%lu\n",
+				       g_orangefs_stats.writes);
+	}
+
+	return rc;
+}
+
+/*
+ * show callback for the integer attributes of orangefs_obj: logs the
+ * attribute name and defers to sysfs_int_show() with ORANGEFS_KOBJ_ID.
+ */
+static ssize_t int_orangefs_show(struct orangefs_obj *orangefs_obj,
+				 struct orangefs_attribute *attr,
+				 char *buf)
+{
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "int_orangefs_show:start attr->attr.name:%s:\n",
+		     attr->attr.name);
+
+	return sysfs_int_show(ORANGEFS_KOBJ_ID, buf, (void *) attr);
+}
+
+/*
+ * show callback for the integer attributes of stats_orangefs_obj: logs
+ * the attribute name and defers to sysfs_int_show() with STATS_KOBJ_ID.
+ */
+static ssize_t int_stats_show(struct stats_orangefs_obj *stats_orangefs_obj,
+			      struct stats_orangefs_attribute *attr,
+			      char *buf)
+{
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "int_stats_show:start attr->attr.name:%s:\n",
+		     attr->attr.name);
+
+	return sysfs_int_show(STATS_KOBJ_ID, buf, (void *) attr);
+}
+
+/*
+ * store callback for the integer attributes of orangefs_obj.
+ *
+ * "op_timeout_secs" and "slot_timeout_secs" are parsed with kstrtoint()
+ * straight into the globals of the same name. Returns @count on success
+ * (also for any other attribute name, which is left untouched) and
+ * -EINVAL if kstrtoint() reports any error.
+ */
+static ssize_t int_store(struct orangefs_obj *orangefs_obj,
+			 struct orangefs_attribute *attr,
+			 const char *buf,
+			 size_t count)
+{
+	const char *name = attr->attr.name;
+	int err = 0;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "int_store: start attr->attr.name:%s: buf:%s:\n",
+		     attr->attr.name, buf);
+
+	if (strcmp(name, "op_timeout_secs") == 0)
+		err = kstrtoint(buf, 0, &op_timeout_secs);
+	else if (strcmp(name, "slot_timeout_secs") == 0)
+		err = kstrtoint(buf, 0, &slot_timeout_secs);
+
+	/* Any parse failure is reported uniformly as -EINVAL. */
+	return err ? -EINVAL : (int)count;
+}
+
+/*
+ * Obtain attribute values from userspace with a service operation.
+ *
+ * @kobj_id: one of the *_KOBJ_ID strings. It selects the type @attr is
+ *           cast to and whether a param request (everything except
+ *           PC_KOBJ_ID) or a perf_count request (PC_KOBJ_ID) is built.
+ * @buf:     sysfs output buffer; at most PAGE_SIZE bytes are written.
+ * @attr:    the attribute being read.
+ *
+ * Returns the number of bytes placed in @buf on success, -ENOMEM if no
+ * op could be allocated, -EINVAL for an unknown @kobj_id, or the non-zero
+ * value reported by is_daemon_in_service() or service_operation().
+ */
+static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
+{
+	struct orangefs_kernel_op_s *new_op = NULL;
+	int rc = 0;
+	char *ser_op_type = NULL;
+	const char *name;
+	__u32 op_alloc_type;
+	/* Every kobject except "pc" is served by a param request. */
+	int is_param = strcmp(kobj_id, PC_KOBJ_ID) != 0;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "sysfs_service_op_show: id:%s:\n",
+		     kobj_id);
+
+	if (is_param)
+		op_alloc_type = ORANGEFS_VFS_OP_PARAM;
+	else
+		op_alloc_type = ORANGEFS_VFS_OP_PERF_COUNT;
+
+	new_op = op_alloc(op_alloc_type);
+	if (!new_op)
+		return -ENOMEM;
+
+	/* Can't do a service_operation if the client is not running... */
+	rc = is_daemon_in_service();
+	if (rc) {
+		/* Log the value that made us bail out; don't query again. */
+		pr_info("%s: Client not running :%d:\n",
+			__func__,
+			rc);
+		goto out;
+	}
+
+	if (is_param)
+		new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_GET;
+
+	/*
+	 * Attribute names within one kobject are mutually exclusive, so
+	 * each group is a single else-if chain.
+	 */
+	if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
+		name = ((struct orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "perf_history_size"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
+		else if (!strcmp(name, "perf_time_interval_secs"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
+		else if (!strcmp(name, "perf_counter_reset"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
+
+	} else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
+		name = ((struct acache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "timeout_msecs"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
+		else if (!strcmp(name, "hard_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
+		else if (!strcmp(name, "soft_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
+		else if (!strcmp(name, "reclaim_percentage"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
+
+	} else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
+		name = ((struct capcache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "timeout_secs"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
+		else if (!strcmp(name, "hard_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
+		else if (!strcmp(name, "soft_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
+		else if (!strcmp(name, "reclaim_percentage"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
+
+	} else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
+		name = ((struct ccache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "timeout_secs"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
+		else if (!strcmp(name, "hard_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
+		else if (!strcmp(name, "soft_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
+		else if (!strcmp(name, "reclaim_percentage"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
+
+	} else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
+		name = ((struct ncache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "timeout_msecs"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
+		else if (!strcmp(name, "hard_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
+		else if (!strcmp(name, "soft_limit"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
+		else if (!strcmp(name, "reclaim_percentage"))
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
+
+	} else if (!strcmp(kobj_id, PC_KOBJ_ID)) {
+		name = ((struct pc_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, ACACHE_KOBJ_ID))
+			new_op->upcall.req.perf_count.type =
+				ORANGEFS_PERF_COUNT_REQUEST_ACACHE;
+		else if (!strcmp(name, CAPCACHE_KOBJ_ID))
+			new_op->upcall.req.perf_count.type =
+				ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE;
+		else if (!strcmp(name, NCACHE_KOBJ_ID))
+			new_op->upcall.req.perf_count.type =
+				ORANGEFS_PERF_COUNT_REQUEST_NCACHE;
+
+	} else {
+		gossip_err("sysfs_service_op_show: unknown kobj_id:%s:\n",
+			   kobj_id);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (is_param)
+		ser_op_type = "orangefs_param";
+	else
+		ser_op_type = "orangefs_perf_count";
+
+	/*
+	 * The service_operation will return an errno return code on
+	 * error, and zero on success.
+	 */
+	rc = service_operation(new_op, ser_op_type, ORANGEFS_OP_INTERRUPTIBLE);
+
+out:
+	/* Only a successful operation has a response worth formatting. */
+	if (!rc) {
+		if (is_param)
+			rc = scnprintf(buf,
+				       PAGE_SIZE,
+				       "%d\n",
+				       (int)new_op->downcall.resp.param.value);
+		else
+			rc = scnprintf(buf,
+				       PAGE_SIZE,
+				       "%s",
+				       new_op->downcall.resp.perf_count.buffer);
+	}
+
+	op_release(new_op);
+
+	return rc;
+
+}
+
+/* show callback: fetch an orangefs_obj attribute via a service operation. */
+static ssize_t service_orangefs_show(struct orangefs_obj *orangefs_obj,
+				     struct orangefs_attribute *attr,
+				     char *buf)
+{
+	return sysfs_service_op_show(ORANGEFS_KOBJ_ID, buf, (void *)attr);
+}
+
+/* show callback: fetch an acache attribute via a service operation. */
+static ssize_t service_acache_show(struct acache_orangefs_obj *acache_orangefs_obj,
+				   struct acache_orangefs_attribute *attr,
+				   char *buf)
+{
+	return sysfs_service_op_show(ACACHE_KOBJ_ID, buf, (void *)attr);
+}
+
+/* show callback: fetch a capcache attribute via a service operation. */
+static ssize_t
+service_capcache_show(struct capcache_orangefs_obj *capcache_orangefs_obj,
+		      struct capcache_orangefs_attribute *attr,
+		      char *buf)
+{
+	return sysfs_service_op_show(CAPCACHE_KOBJ_ID, buf, (void *)attr);
+}
+
+/* show callback: fetch a ccache attribute via a service operation. */
+static ssize_t
+service_ccache_show(struct ccache_orangefs_obj *ccache_orangefs_obj,
+		    struct ccache_orangefs_attribute *attr,
+		    char *buf)
+{
+	return sysfs_service_op_show(CCACHE_KOBJ_ID, buf, (void *)attr);
+}
+
+/* show callback: fetch an ncache attribute via a service operation. */
+static ssize_t service_ncache_show(struct ncache_orangefs_obj *ncache_orangefs_obj,
+				   struct ncache_orangefs_attribute *attr,
+				   char *buf)
+{
+	return sysfs_service_op_show(NCACHE_KOBJ_ID, buf, (void *)attr);
+}
+
+/* show callback: fetch a "pc" attribute via a perf_count service operation. */
+static ssize_t service_pc_show(struct pc_orangefs_obj *pc_orangefs_obj,
+			       struct pc_orangefs_attribute *attr,
+			       char *buf)
+{
+	return sysfs_service_op_show(PC_KOBJ_ID, buf, (void *)attr);
+}
+
+/*
+ * Pass an attribute value to userspace with a service operation.
+ *
+ * The text in @buf is parsed with kstrtoint(), checked against the range
+ * accepted for the named attribute, and sent as an
+ * ORANGEFS_PARAM_REQUEST_SET param request.
+ *
+ * @kobj_id: one of the *_KOBJ_ID strings; selects the type @attr is cast
+ *           to and the set of attribute names that are recognised.
+ * @buf:     the text the user wrote.
+ * @attr:    the attribute being written.
+ *
+ * Accepted ranges:
+ *   perf_history_size, perf_time_interval_secs: val > 0
+ *   perf_counter_reset:                         val is 0 or 1
+ *   hard_limit, soft_limit, timeout_*:          val >= 0
+ *   reclaim_percentage:                         0 <= val <= 100
+ *
+ * Returns 1 when the service operation did not fail. Returns -EINVAL when
+ * no op could be allocated, the value is out of range, @kobj_id is
+ * unknown or the service operation failed. A non-zero result from
+ * is_daemon_in_service() or kstrtoint() is returned as is, except that
+ * -ENOMEM is also turned into -EINVAL.
+ */
+static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
+{
+	struct orangefs_kernel_op_s *new_op = NULL;
+	const char *name;
+	int val = 0;
+	int rc = 0;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG,
+		     "sysfs_service_op_store: id:%s:\n",
+		     kobj_id);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_PARAM);
+	if (!new_op)
+		return -EINVAL; /* sic */
+
+	/* Can't do a service_operation if the client is not running... */
+	rc = is_daemon_in_service();
+	if (rc) {
+		/* Log the value that made us bail out; don't query again. */
+		pr_info("%s: Client not running :%d:\n",
+			__func__,
+			rc);
+		goto out;
+	}
+
+	/*
+	 * The value we want to send back to userspace is in buf.
+	 */
+	rc = kstrtoint(buf, 0, &val);
+	if (rc)
+		goto out;
+
+	if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
+		name = ((struct orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "perf_history_size")) {
+			if (val <= 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE;
+		} else if (!strcmp(name, "perf_time_interval_secs")) {
+			if (val <= 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS;
+		} else if (!strcmp(name, "perf_counter_reset")) {
+			if (val != 0 && val != 1)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
+		}
+
+	} else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
+		name = ((struct acache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "hard_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT;
+		} else if (!strcmp(name, "soft_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT;
+		} else if (!strcmp(name, "reclaim_percentage")) {
+			if (val < 0 || val > 100)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE;
+		} else if (!strcmp(name, "timeout_msecs")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS;
+		}
+
+	} else if (!strcmp(kobj_id, CAPCACHE_KOBJ_ID)) {
+		name = ((struct capcache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "hard_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT;
+		} else if (!strcmp(name, "soft_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT;
+		} else if (!strcmp(name, "reclaim_percentage")) {
+			if (val < 0 || val > 100)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE;
+		} else if (!strcmp(name, "timeout_secs")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS;
+		}
+
+	} else if (!strcmp(kobj_id, CCACHE_KOBJ_ID)) {
+		name = ((struct ccache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "hard_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT;
+		} else if (!strcmp(name, "soft_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT;
+		} else if (!strcmp(name, "reclaim_percentage")) {
+			if (val < 0 || val > 100)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE;
+		} else if (!strcmp(name, "timeout_secs")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS;
+		}
+
+	} else if (!strcmp(kobj_id, NCACHE_KOBJ_ID)) {
+		name = ((struct ncache_orangefs_attribute *)attr)->attr.name;
+
+		if (!strcmp(name, "hard_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT;
+		} else if (!strcmp(name, "soft_limit")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT;
+		} else if (!strcmp(name, "reclaim_percentage")) {
+			if (val < 0 || val > 100)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE;
+		} else if (!strcmp(name, "timeout_msecs")) {
+			if (val < 0)
+				goto reject;
+			new_op->upcall.req.param.op =
+				ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS;
+		}
+
+	} else {
+		gossip_err("sysfs_service_op_store: unknown kobj_id:%s:\n",
+			   kobj_id);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
+
+	new_op->upcall.req.param.value = val;
+
+	/*
+	 * The service_operation will return a errno return code on
+	 * error, and zero on success.
+	 */
+	rc = service_operation(new_op, "orangefs_param", ORANGEFS_OP_INTERRUPTIBLE);
+
+	if (rc < 0) {
+		gossip_err("sysfs_service_op_store: service op returned:%d:\n",
+			   rc);
+		rc = 0;
+	} else {
+		rc = 1;
+	}
+	goto out;
+
+reject:
+	/* Value outside the accepted range; mapped to -EINVAL below. */
+	rc = 0;
+
+out:
+	op_release(new_op);
+
+	if (rc == -ENOMEM || rc == 0)
+		rc = -EINVAL;
+
+	return rc;
+}
+
+/*
+ * store callback: push an orangefs_obj attribute to userspace.
+ *
+ * sysfs_service_op_store() yields 1 on success, which is translated into
+ * @count; any other value is returned unchanged.
+ */
+static ssize_t service_orangefs_store(struct orangefs_obj *orangefs_obj,
+				      struct orangefs_attribute *attr,
+				      const char *buf,
+				      size_t count)
+{
+	int rc = sysfs_service_op_store(ORANGEFS_KOBJ_ID, buf, (void *) attr);
+
+	return rc == 1 ? (int)count : rc;
+}
+
+/*
+ * store callback: push an acache attribute to userspace.
+ *
+ * sysfs_service_op_store() yields 1 on success, which is translated into
+ * @count; any other value is returned unchanged.
+ */
+static ssize_t service_acache_store(struct acache_orangefs_obj *acache_orangefs_obj,
+				    struct acache_orangefs_attribute *attr,
+				    const char *buf,
+				    size_t count)
+{
+	int rc = sysfs_service_op_store(ACACHE_KOBJ_ID, buf, (void *) attr);
+
+	return rc == 1 ? (int)count : rc;
+}
+
+/*
+ * store callback: push a capcache attribute to userspace.
+ *
+ * sysfs_service_op_store() yields 1 on success, which is translated into
+ * @count; any other value is returned unchanged.
+ */
+static ssize_t
+service_capcache_store(struct capcache_orangefs_obj *capcache_orangefs_obj,
+		       struct capcache_orangefs_attribute *attr,
+		       const char *buf,
+		       size_t count)
+{
+	int rc = sysfs_service_op_store(CAPCACHE_KOBJ_ID, buf, (void *) attr);
+
+	return rc == 1 ? (int)count : rc;
+}
+
+/*
+ * sysfs store handler for the ccache attributes. Forwards the write to
+ * sysfs_service_op_store() under CCACHE_KOBJ_ID and reports the full byte
+ * count when that helper signals success with 1.
+ */
+static ssize_t service_ccache_store(struct ccache_orangefs_obj *ccache_orangefs_obj,
+				    struct ccache_orangefs_attribute *attr,
+				    const char *buf,
+				    size_t count)
+{
+	int status;
+
+	status = sysfs_service_op_store(CCACHE_KOBJ_ID, buf, (void *) attr);
+
+	/* Anything other than 1 is returned to the caller as-is. */
+	if (status != 1)
+		return status;
+
+	status = count;
+	return status;
+}
+
+/*
+ * sysfs store handler for the ncache attributes. Forwards the write to
+ * sysfs_service_op_store() under NCACHE_KOBJ_ID and reports the full byte
+ * count when that helper signals success with 1.
+ */
+static ssize_t service_ncache_store(struct ncache_orangefs_obj *ncache_orangefs_obj,
+				    struct ncache_orangefs_attribute *attr,
+				    const char *buf,
+				    size_t count)
+{
+	int status;
+
+	status = sysfs_service_op_store(NCACHE_KOBJ_ID, buf, (void *) attr);
+
+	/* Anything other than 1 is returned to the caller as-is. */
+	if (status != 1)
+		return status;
+
+	status = count;
+	return status;
+}
+
+/*
+ * Attributes of the top-level orangefs kobject. The two timeout knobs use
+ * the int show/store handlers; the perf_* knobs go through the service
+ * show/store handlers.
+ */
+static struct orangefs_attribute op_timeout_secs_attribute =
+	__ATTR(op_timeout_secs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute slot_timeout_secs_attribute =
+	__ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute perf_counter_reset_attribute =
+	__ATTR(perf_counter_reset, 0664,
+	       service_orangefs_show, service_orangefs_store);
+
+static struct orangefs_attribute perf_history_size_attribute =
+	__ATTR(perf_history_size, 0664,
+	       service_orangefs_show, service_orangefs_store);
+
+static struct orangefs_attribute perf_time_interval_secs_attribute =
+	__ATTR(perf_time_interval_secs, 0664,
+	       service_orangefs_show, service_orangefs_store);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *orangefs_default_attrs[] = {
+	&op_timeout_secs_attribute.attr,
+	&slot_timeout_secs_attribute.attr,
+	&perf_counter_reset_attribute.attr,
+	&perf_history_size_attribute.attr,
+	&perf_time_interval_secs_attribute.attr,
+	NULL,
+};
+
+/* Binds the attribute list to its sysfs_ops and release callback. */
+static struct kobj_type orangefs_ktype = {
+	.default_attrs	= orangefs_default_attrs,
+	.sysfs_ops	= &orangefs_sysfs_ops,
+	.release	= orangefs_release,
+};
+
+/*
+ * Attributes of the acache kobject; every one is read and written through
+ * the acache service show/store handlers.
+ */
+static struct acache_orangefs_attribute acache_hard_limit_attribute =
+	__ATTR(hard_limit, 0664, service_acache_show, service_acache_store);
+
+static struct acache_orangefs_attribute acache_reclaim_percent_attribute =
+	__ATTR(reclaim_percentage, 0664,
+	       service_acache_show, service_acache_store);
+
+static struct acache_orangefs_attribute acache_soft_limit_attribute =
+	__ATTR(soft_limit, 0664, service_acache_show, service_acache_store);
+
+static struct acache_orangefs_attribute acache_timeout_msecs_attribute =
+	__ATTR(timeout_msecs, 0664, service_acache_show, service_acache_store);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *acache_orangefs_default_attrs[] = {
+	&acache_hard_limit_attribute.attr,
+	&acache_reclaim_percent_attribute.attr,
+	&acache_soft_limit_attribute.attr,
+	&acache_timeout_msecs_attribute.attr,
+	NULL,
+};
+
+/* Binds the attribute list to its sysfs_ops and release callback. */
+static struct kobj_type acache_orangefs_ktype = {
+	.default_attrs	= acache_orangefs_default_attrs,
+	.sysfs_ops	= &acache_orangefs_sysfs_ops,
+	.release	= acache_orangefs_release,
+};
+
+/*
+ * Attributes of the capcache kobject; every one is read and written
+ * through the capcache service show/store handlers.
+ */
+static struct capcache_orangefs_attribute capcache_hard_limit_attribute =
+	__ATTR(hard_limit, 0664,
+	       service_capcache_show, service_capcache_store);
+
+static struct capcache_orangefs_attribute capcache_reclaim_percent_attribute =
+	__ATTR(reclaim_percentage, 0664,
+	       service_capcache_show, service_capcache_store);
+
+static struct capcache_orangefs_attribute capcache_soft_limit_attribute =
+	__ATTR(soft_limit, 0664,
+	       service_capcache_show, service_capcache_store);
+
+static struct capcache_orangefs_attribute capcache_timeout_secs_attribute =
+	__ATTR(timeout_secs, 0664,
+	       service_capcache_show, service_capcache_store);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *capcache_orangefs_default_attrs[] = {
+	&capcache_hard_limit_attribute.attr,
+	&capcache_reclaim_percent_attribute.attr,
+	&capcache_soft_limit_attribute.attr,
+	&capcache_timeout_secs_attribute.attr,
+	NULL,
+};
+
+/* Binds the attribute list to its sysfs_ops and release callback. */
+static struct kobj_type capcache_orangefs_ktype = {
+	.default_attrs	= capcache_orangefs_default_attrs,
+	.sysfs_ops	= &capcache_orangefs_sysfs_ops,
+	.release	= capcache_orangefs_release,
+};
+
+/*
+ * Attributes of the ccache kobject; every one is read and written through
+ * the ccache service show/store handlers.
+ */
+static struct ccache_orangefs_attribute ccache_hard_limit_attribute =
+	__ATTR(hard_limit, 0664, service_ccache_show, service_ccache_store);
+
+static struct ccache_orangefs_attribute ccache_reclaim_percent_attribute =
+	__ATTR(reclaim_percentage, 0664,
+	       service_ccache_show, service_ccache_store);
+
+static struct ccache_orangefs_attribute ccache_soft_limit_attribute =
+	__ATTR(soft_limit, 0664, service_ccache_show, service_ccache_store);
+
+static struct ccache_orangefs_attribute ccache_timeout_secs_attribute =
+	__ATTR(timeout_secs, 0664, service_ccache_show, service_ccache_store);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *ccache_orangefs_default_attrs[] = {
+	&ccache_hard_limit_attribute.attr,
+	&ccache_reclaim_percent_attribute.attr,
+	&ccache_soft_limit_attribute.attr,
+	&ccache_timeout_secs_attribute.attr,
+	NULL,
+};
+
+/* Binds the attribute list to its sysfs_ops and release callback. */
+static struct kobj_type ccache_orangefs_ktype = {
+	.default_attrs	= ccache_orangefs_default_attrs,
+	.sysfs_ops	= &ccache_orangefs_sysfs_ops,
+	.release	= ccache_orangefs_release,
+};
+
+/*
+ * Attributes of the ncache kobject; every one is read and written through
+ * the ncache service show/store handlers.
+ */
+static struct ncache_orangefs_attribute ncache_hard_limit_attribute =
+	__ATTR(hard_limit, 0664, service_ncache_show, service_ncache_store);
+
+static struct ncache_orangefs_attribute ncache_reclaim_percent_attribute =
+	__ATTR(reclaim_percentage, 0664,
+	       service_ncache_show, service_ncache_store);
+
+static struct ncache_orangefs_attribute ncache_soft_limit_attribute =
+	__ATTR(soft_limit, 0664, service_ncache_show, service_ncache_store);
+
+static struct ncache_orangefs_attribute ncache_timeout_msecs_attribute =
+	__ATTR(timeout_msecs, 0664, service_ncache_show, service_ncache_store);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *ncache_orangefs_default_attrs[] = {
+	&ncache_hard_limit_attribute.attr,
+	&ncache_reclaim_percent_attribute.attr,
+	&ncache_soft_limit_attribute.attr,
+	&ncache_timeout_msecs_attribute.attr,
+	NULL,
+};
+
+/* Binds the attribute list to its sysfs_ops and release callback. */
+static struct kobj_type ncache_orangefs_ktype = {
+	.default_attrs	= ncache_orangefs_default_attrs,
+	.sysfs_ops	= &ncache_orangefs_sysfs_ops,
+	.release	= ncache_orangefs_release,
+};
+
+/*
+ * Attributes of the perf_counters kobject. They only have a show handler.
+ *
+ * NOTE(review): the mode is 0664 although no store handler is supplied;
+ * confirm whether these files are meant to be writable at all.
+ */
+static struct pc_orangefs_attribute pc_acache_attribute =
+	__ATTR(acache, 0664, service_pc_show, NULL);
+
+static struct pc_orangefs_attribute pc_capcache_attribute =
+	__ATTR(capcache, 0664, service_pc_show, NULL);
+
+static struct pc_orangefs_attribute pc_ncache_attribute =
+	__ATTR(ncache, 0664, service_pc_show, NULL);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *pc_orangefs_default_attrs[] = {
+	&pc_acache_attribute.attr,
+	&pc_capcache_attribute.attr,
+	&pc_ncache_attribute.attr,
+	NULL,
+};
+
+/* Binds the attribute list to its sysfs_ops and release callback. */
+static struct kobj_type pc_orangefs_ktype = {
+	.default_attrs	= pc_orangefs_default_attrs,
+	.sysfs_ops	= &pc_orangefs_sysfs_ops,
+	.release	= pc_orangefs_release,
+};
+
+/*
+ * Attributes of the stats kobject. They only have a show handler.
+ *
+ * NOTE(review): the mode is 0664 although no store handler is supplied;
+ * confirm whether these files are meant to be writable at all.
+ */
+static struct stats_orangefs_attribute stats_reads_attribute =
+	__ATTR(reads, 0664, int_stats_show, NULL);
+
+static struct stats_orangefs_attribute stats_writes_attribute =
+	__ATTR(writes, 0664, int_stats_show, NULL);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *stats_orangefs_default_attrs[] = {
+	&stats_reads_attribute.attr,
+	&stats_writes_attribute.attr,
+	NULL,
+};
+
+/* Binds the attribute list to its sysfs_ops and release callback. */
+static struct kobj_type stats_orangefs_ktype = {
+	.default_attrs	= stats_orangefs_default_attrs,
+	.sysfs_ops	= &stats_orangefs_sysfs_ops,
+	.release	= stats_orangefs_release,
+};
+
+/*
+ * One object per sysfs directory. Each is allocated and registered in
+ * orangefs_sysfs_init() and dropped with kobject_put() in
+ * orangefs_sysfs_exit().
+ */
+static struct orangefs_obj *orangefs_obj;
+static struct acache_orangefs_obj *acache_orangefs_obj;
+static struct capcache_orangefs_obj *capcache_orangefs_obj;
+static struct ccache_orangefs_obj *ccache_orangefs_obj;
+static struct ncache_orangefs_obj *ncache_orangefs_obj;
+static struct pc_orangefs_obj *pc_orangefs_obj;
+static struct stats_orangefs_obj *stats_orangefs_obj;
+
+/*
+ * Create the orangefs sysfs hierarchy: the top-level orangefs kobject
+ * under fs_kobj, plus one child kobject each for acache, capcache, ccache,
+ * ncache, perf_counters and stats.
+ *
+ * Returns 0 on success. On failure every kobject initialised so far is
+ * dropped with kobject_put(), the matching file-scope pointer is reset to
+ * NULL so nothing is left dangling, and a negative errno is returned:
+ * -ENOMEM when an allocation fails, otherwise the error reported by
+ * kobject_init_and_add().
+ */
+int orangefs_sysfs_init(void)
+{
+	int rc = -ENOMEM;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_init: start\n");
+
+	/* create /sys/fs/orangefs. */
+	orangefs_obj = kzalloc(sizeof(*orangefs_obj), GFP_KERNEL);
+	if (!orangefs_obj)
+		goto out;
+
+	rc = kobject_init_and_add(&orangefs_obj->kobj,
+				  &orangefs_ktype,
+				  fs_kobj,
+				  ORANGEFS_KOBJ_ID);
+	if (rc)
+		goto ofs_obj_bail;
+
+	kobject_uevent(&orangefs_obj->kobj, KOBJ_ADD);
+
+	/* create /sys/fs/orangefs/acache. */
+	acache_orangefs_obj = kzalloc(sizeof(*acache_orangefs_obj), GFP_KERNEL);
+	if (!acache_orangefs_obj) {
+		rc = -ENOMEM;
+		goto ofs_obj_bail;
+	}
+
+	rc = kobject_init_and_add(&acache_orangefs_obj->kobj,
+				  &acache_orangefs_ktype,
+				  &orangefs_obj->kobj,
+				  ACACHE_KOBJ_ID);
+	if (rc)
+		goto acache_obj_bail;
+
+	kobject_uevent(&acache_orangefs_obj->kobj, KOBJ_ADD);
+
+	/* create /sys/fs/orangefs/capcache. */
+	capcache_orangefs_obj =
+		kzalloc(sizeof(*capcache_orangefs_obj), GFP_KERNEL);
+	if (!capcache_orangefs_obj) {
+		rc = -ENOMEM;
+		goto acache_obj_bail;
+	}
+
+	rc = kobject_init_and_add(&capcache_orangefs_obj->kobj,
+				  &capcache_orangefs_ktype,
+				  &orangefs_obj->kobj,
+				  CAPCACHE_KOBJ_ID);
+	if (rc)
+		goto capcache_obj_bail;
+
+	kobject_uevent(&capcache_orangefs_obj->kobj, KOBJ_ADD);
+
+	/* create /sys/fs/orangefs/ccache. */
+	ccache_orangefs_obj =
+		kzalloc(sizeof(*ccache_orangefs_obj), GFP_KERNEL);
+	if (!ccache_orangefs_obj) {
+		rc = -ENOMEM;
+		goto capcache_obj_bail;
+	}
+
+	rc = kobject_init_and_add(&ccache_orangefs_obj->kobj,
+				  &ccache_orangefs_ktype,
+				  &orangefs_obj->kobj,
+				  CCACHE_KOBJ_ID);
+	if (rc)
+		goto ccache_obj_bail;
+
+	kobject_uevent(&ccache_orangefs_obj->kobj, KOBJ_ADD);
+
+	/* create /sys/fs/orangefs/ncache. */
+	ncache_orangefs_obj = kzalloc(sizeof(*ncache_orangefs_obj), GFP_KERNEL);
+	if (!ncache_orangefs_obj) {
+		rc = -ENOMEM;
+		goto ccache_obj_bail;
+	}
+
+	rc = kobject_init_and_add(&ncache_orangefs_obj->kobj,
+				  &ncache_orangefs_ktype,
+				  &orangefs_obj->kobj,
+				  NCACHE_KOBJ_ID);
+	if (rc)
+		goto ncache_obj_bail;
+
+	kobject_uevent(&ncache_orangefs_obj->kobj, KOBJ_ADD);
+
+	/* create /sys/fs/orangefs/perf_counters. */
+	pc_orangefs_obj = kzalloc(sizeof(*pc_orangefs_obj), GFP_KERNEL);
+	if (!pc_orangefs_obj) {
+		rc = -ENOMEM;
+		goto ncache_obj_bail;
+	}
+
+	rc = kobject_init_and_add(&pc_orangefs_obj->kobj,
+				  &pc_orangefs_ktype,
+				  &orangefs_obj->kobj,
+				  "perf_counters");
+	if (rc)
+		goto pc_obj_bail;
+
+	kobject_uevent(&pc_orangefs_obj->kobj, KOBJ_ADD);
+
+	/* create /sys/fs/orangefs/stats. */
+	stats_orangefs_obj = kzalloc(sizeof(*stats_orangefs_obj), GFP_KERNEL);
+	if (!stats_orangefs_obj) {
+		rc = -ENOMEM;
+		goto pc_obj_bail;
+	}
+
+	rc = kobject_init_and_add(&stats_orangefs_obj->kobj,
+				  &stats_orangefs_ktype,
+				  &orangefs_obj->kobj,
+				  STATS_KOBJ_ID);
+	if (rc)
+		goto stats_obj_bail;
+
+	kobject_uevent(&stats_orangefs_obj->kobj, KOBJ_ADD);
+	goto out;
+
+	/*
+	 * Unwind in reverse creation order. Each label drops the object whose
+	 * kobject_init_and_add() was attempted last and then falls through to
+	 * the older ones. The pointers are cleared after the put because the
+	 * objects must not be touched again once their reference is gone.
+	 */
+stats_obj_bail:
+	kobject_put(&stats_orangefs_obj->kobj);
+	stats_orangefs_obj = NULL;
+
+pc_obj_bail:
+	kobject_put(&pc_orangefs_obj->kobj);
+	pc_orangefs_obj = NULL;
+
+ncache_obj_bail:
+	kobject_put(&ncache_orangefs_obj->kobj);
+	ncache_orangefs_obj = NULL;
+
+ccache_obj_bail:
+	kobject_put(&ccache_orangefs_obj->kobj);
+	ccache_orangefs_obj = NULL;
+
+capcache_obj_bail:
+	kobject_put(&capcache_orangefs_obj->kobj);
+	capcache_orangefs_obj = NULL;
+
+acache_obj_bail:
+	kobject_put(&acache_orangefs_obj->kobj);
+	acache_orangefs_obj = NULL;
+
+ofs_obj_bail:
+	kobject_put(&orangefs_obj->kobj);
+	orangefs_obj = NULL;
+out:
+	return rc;
+}
+
+/*
+ * Tear down the orangefs sysfs hierarchy by dropping the reference on
+ * every kobject set up by orangefs_sysfs_init(): the six children first,
+ * the top-level orangefs kobject last.
+ */
+void orangefs_sysfs_exit(void)
+{
+	/* Children in the order they are released, parent at the end. */
+	struct kobject *doomed[] = {
+		&acache_orangefs_obj->kobj,
+		&capcache_orangefs_obj->kobj,
+		&ccache_orangefs_obj->kobj,
+		&ncache_orangefs_obj->kobj,
+		&pc_orangefs_obj->kobj,
+		&stats_orangefs_obj->kobj,
+		&orangefs_obj->kobj,
+	};
+	size_t idx;
+
+	gossip_debug(GOSSIP_SYSFS_DEBUG, "orangefs_sysfs_exit: start\n");
+
+	for (idx = 0; idx < sizeof(doomed) / sizeof(doomed[0]); idx++)
+		kobject_put(doomed[idx]);
+}
diff --git a/fs/orangefs/orangefs-sysfs.h b/fs/orangefs/orangefs-sysfs.h
new file mode 100644
index 000000000000..f0b76382db02
--- /dev/null
+++ b/fs/orangefs/orangefs-sysfs.h
@@ -0,0 +1,2 @@
+extern int orangefs_sysfs_init(void);
+extern void orangefs_sysfs_exit(void);
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
new file mode 100644
index 000000000000..2d129b5886ee
--- /dev/null
+++ b/fs/orangefs/orangefs-utils.c
@@ -0,0 +1,1052 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-dev-proto.h"
+#include "orangefs-bufmap.h"
+
+/*
+ * Return the filesystem id carried in the upcall request of @op.
+ *
+ * Which request member holds the id depends on the upcall type. A NULL
+ * @op, or an upcall type not listed here, yields ORANGEFS_FS_ID_NULL.
+ */
+__s32 fsid_of_op(struct orangefs_kernel_op_s *op)
+{
+	if (!op)
+		return ORANGEFS_FS_ID_NULL;
+
+	switch (op->upcall.type) {
+	case ORANGEFS_VFS_OP_FILE_IO:
+		return op->upcall.req.io.refn.fs_id;
+	case ORANGEFS_VFS_OP_LOOKUP:
+		return op->upcall.req.lookup.parent_refn.fs_id;
+	case ORANGEFS_VFS_OP_CREATE:
+		return op->upcall.req.create.parent_refn.fs_id;
+	case ORANGEFS_VFS_OP_GETATTR:
+		return op->upcall.req.getattr.refn.fs_id;
+	case ORANGEFS_VFS_OP_REMOVE:
+		return op->upcall.req.remove.parent_refn.fs_id;
+	case ORANGEFS_VFS_OP_MKDIR:
+		return op->upcall.req.mkdir.parent_refn.fs_id;
+	case ORANGEFS_VFS_OP_READDIR:
+		return op->upcall.req.readdir.refn.fs_id;
+	case ORANGEFS_VFS_OP_SETATTR:
+		return op->upcall.req.setattr.refn.fs_id;
+	case ORANGEFS_VFS_OP_SYMLINK:
+		return op->upcall.req.sym.parent_refn.fs_id;
+	case ORANGEFS_VFS_OP_RENAME:
+		return op->upcall.req.rename.old_parent_refn.fs_id;
+	case ORANGEFS_VFS_OP_STATFS:
+		return op->upcall.req.statfs.fs_id;
+	case ORANGEFS_VFS_OP_TRUNCATE:
+		return op->upcall.req.truncate.refn.fs_id;
+	case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
+		return op->upcall.req.ra_cache_flush.refn.fs_id;
+	case ORANGEFS_VFS_OP_FS_UMOUNT:
+		return op->upcall.req.fs_umount.fs_id;
+	case ORANGEFS_VFS_OP_GETXATTR:
+		return op->upcall.req.getxattr.refn.fs_id;
+	case ORANGEFS_VFS_OP_SETXATTR:
+		return op->upcall.req.setxattr.refn.fs_id;
+	case ORANGEFS_VFS_OP_LISTXATTR:
+		return op->upcall.req.listxattr.refn.fs_id;
+	case ORANGEFS_VFS_OP_REMOVEXATTR:
+		return op->upcall.req.removexattr.refn.fs_id;
+	case ORANGEFS_VFS_OP_FSYNC:
+		return op->upcall.req.fsync.refn.fs_id;
+	default:
+		return ORANGEFS_FS_ID_NULL;
+	}
+}
+
+/*
+ * Translate the ORANGEFS attribute flags in @attrs into the matching VFS
+ * inode flags (S_IMMUTABLE, S_APPEND, S_NOATIME). Flags that are not set
+ * on the ORANGEFS side stay clear in the result.
+ */
+static int orangefs_inode_flags(struct ORANGEFS_sys_attr_s *attrs)
+{
+	int vfs_flags = 0;
+
+	if (attrs->flags & ORANGEFS_IMMUTABLE_FL)
+		vfs_flags |= S_IMMUTABLE;
+	if (attrs->flags & ORANGEFS_APPEND_FL)
+		vfs_flags |= S_APPEND;
+	if (attrs->flags & ORANGEFS_NOATIME_FL)
+		vfs_flags |= S_NOATIME;
+
+	return vfs_flags;
+}
+
+/*
+ * Translate the ORANGEFS permission bits in @attrs into VFS mode bits
+ * (other/group/user rwx plus setgid and setuid).
+ */
+static int orangefs_inode_perms(struct ORANGEFS_sys_attr_s *attrs)
+{
+	/* Each ORANGEFS permission bit paired with its VFS counterpart. */
+	static const struct {
+		int orangefs_bit;
+		int vfs_bit;
+	} perm_map[] = {
+		{ ORANGEFS_O_EXECUTE,	S_IXOTH },
+		{ ORANGEFS_O_WRITE,	S_IWOTH },
+		{ ORANGEFS_O_READ,	S_IROTH },
+		{ ORANGEFS_G_EXECUTE,	S_IXGRP },
+		{ ORANGEFS_G_WRITE,	S_IWGRP },
+		{ ORANGEFS_G_READ,	S_IRGRP },
+		{ ORANGEFS_U_EXECUTE,	S_IXUSR },
+		{ ORANGEFS_U_WRITE,	S_IWUSR },
+		{ ORANGEFS_U_READ,	S_IRUSR },
+		{ ORANGEFS_G_SGID,	S_ISGID },
+		{ ORANGEFS_U_SUID,	S_ISUID },
+	};
+	int mode_bits = 0;
+	size_t k;
+
+	for (k = 0; k < sizeof(perm_map) / sizeof(perm_map[0]); k++)
+		if (attrs->perms & perm_map[k].orangefs_bit)
+			mode_bits |= perm_map[k].vfs_bit;
+
+	return mode_bits;
+}
+
+/*
+ * Fill @attrs from the fields of @iattr that ia_valid marks as present and
+ * set the matching ORANGEFS_ATTR_SYS_* bits in attrs->mask.
+ *
+ * Returns 0 on success and -EINVAL when an argument is NULL, when the
+ * sticky bit is requested on anything but the root handle, or when the
+ * setuid bit is requested.
+ *
+ * NOTE: in kernel land the sys_attr link_target is never used, so it is
+ * not copied here.
+ *
+ * NOTE(review): ids are translated with current_user_ns() here, while
+ * orangefs_inode_getattr() maps them back with init_user_ns - confirm the
+ * asymmetry is intended.
+ */
+static inline int copy_attributes_from_inode(struct inode *inode,
+					      struct ORANGEFS_sys_attr_s *attrs,
+					      struct iattr *iattr)
+{
+	umode_t wanted_mode;
+
+	if (!iattr || !inode || !attrs) {
+		gossip_err("NULL iattr (%p), inode (%p), attrs (%p) in copy_attributes_from_inode!\n",
+			   iattr, inode, attrs);
+		return -EINVAL;
+	}
+
+	/* Only fields flagged in ia_valid are trusted and copied. */
+	attrs->mask = 0;
+
+	if (iattr->ia_valid & ATTR_UID) {
+		attrs->owner = from_kuid(current_user_ns(), iattr->ia_uid);
+		attrs->mask |= ORANGEFS_ATTR_SYS_UID;
+		gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner);
+	}
+
+	if (iattr->ia_valid & ATTR_GID) {
+		attrs->group = from_kgid(current_user_ns(), iattr->ia_gid);
+		attrs->mask |= ORANGEFS_ATTR_SYS_GID;
+		gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group);
+	}
+
+	if (iattr->ia_valid & ATTR_ATIME) {
+		attrs->mask |= ORANGEFS_ATTR_SYS_ATIME;
+		if (iattr->ia_valid & ATTR_ATIME_SET) {
+			attrs->atime = (time64_t)iattr->ia_atime.tv_sec;
+			attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET;
+		}
+	}
+
+	if (iattr->ia_valid & ATTR_MTIME) {
+		attrs->mask |= ORANGEFS_ATTR_SYS_MTIME;
+		if (iattr->ia_valid & ATTR_MTIME_SET) {
+			attrs->mtime = (time64_t)iattr->ia_mtime.tv_sec;
+			attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET;
+		}
+	}
+
+	if (iattr->ia_valid & ATTR_CTIME)
+		attrs->mask |= ORANGEFS_ATTR_SYS_CTIME;
+
+	/*
+	 * ORANGEFS cannot set the size through a setattr operation, so
+	 * ATTR_SIZE is deliberately ignored.
+	 */
+
+	if (!(iattr->ia_valid & ATTR_MODE))
+		return 0;
+
+	wanted_mode = iattr->ia_mode;
+
+	if (wanted_mode & S_ISVTX) {
+		if (!is_root_handle(inode)) {
+			gossip_debug(GOSSIP_UTILS_DEBUG,
+				     "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
+			return -EINVAL;
+		}
+		/*
+		 * The sticky bit may be set on the root (it shows up that
+		 * way by default anyhow), but it is hidden from the server.
+		 */
+		wanted_mode &= ~S_ISVTX;
+	}
+
+	if (wanted_mode & S_ISUID) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "Attempting to set setuid bit (not supported); returning EINVAL.\n");
+		return -EINVAL;
+	}
+
+	attrs->perms = ORANGEFS_util_translate_mode(wanted_mode);
+	attrs->mask |= ORANGEFS_ATTR_SYS_PERM;
+
+	return 0;
+}
+
+/*
+ * Map an ORANGEFS object type to the VFS file type bits: S_IFREG for a
+ * metafile, S_IFDIR for a directory, S_IFLNK for a symlink. Any other
+ * object type yields -1.
+ */
+static int orangefs_inode_type(enum orangefs_ds_type objtype)
+{
+	switch (objtype) {
+	case ORANGEFS_TYPE_METAFILE:
+		return S_IFREG;
+	case ORANGEFS_TYPE_DIRECTORY:
+		return S_IFDIR;
+	case ORANGEFS_TYPE_SYMLINK:
+		return S_IFLNK;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Decide whether an already-initialised inode no longer matches what the
+ * server reports in @attrs / @link_target.
+ *
+ * A new inode (@new != 0) is never stale. Otherwise the inode is stale
+ * when the server's object type is unknown, when it differs from the file
+ * type recorded in i_mode, or when the inode is a symlink whose cached
+ * target differs from @link_target. A stale inode is passed to
+ * orangefs_make_bad_inode().
+ *
+ * Returns 1 if the inode is stale, 0 otherwise.
+ */
+static int orangefs_inode_is_stale(struct inode *inode, int new,
+    struct ORANGEFS_sys_attr_s *attrs, char *link_target)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	int type = orangefs_inode_type(attrs->objtype);
+
+	if (new)
+		return 0;
+
+	/*
+	 * Compare the complete file type field. S_IFREG and S_IFLNK share a
+	 * bit, so testing "i_mode & type" cannot tell them apart and would
+	 * miss a symlink that has been replaced by a regular file.
+	 */
+	if (type == -1 || (inode->i_mode & S_IFMT) != type) {
+		orangefs_make_bad_inode(inode);
+		return 1;
+	}
+
+	/* Same type; a symlink is still stale if its target changed. */
+	if (type == S_IFLNK && strncmp(orangefs_inode->link_target,
+	    link_target, ORANGEFS_NAME_MAX)) {
+		orangefs_make_bad_inode(inode);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fetch the attributes of @inode with a GETATTR upcall and copy them into
+ * the VFS inode.
+ *
+ * @new:  non-zero when the inode is being set up for the first time; the
+ *        staleness check is skipped and, for symlinks, the link target is
+ *        cached in the orangefs inode.
+ * @size: non-zero to also request and update the file size.
+ *
+ * Returns 0 on success, -ENOMEM if no op could be allocated, -ESTALE if
+ * the inode no longer matches the server's view or the server reports an
+ * object type this code does not handle, -EIO if the link target does not
+ * fit, or the error returned by service_operation().
+ */
+int orangefs_inode_getattr(struct inode *inode, int new, int size)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *new_op;
+	struct ORANGEFS_sys_attr_s *attrs;
+	loff_t inode_size, rounded_up_size;
+	int ret, type;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
+	    get_khandle_from_ino(inode));
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
+	if (!new_op)
+		return -ENOMEM;
+	new_op->upcall.req.getattr.refn = orangefs_inode->refn;
+	new_op->upcall.req.getattr.mask = size ?
+	    ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
+
+	ret = service_operation(new_op, __func__,
+	    get_interruptible_flag(inode));
+	if (ret != 0)
+		goto out;
+
+	attrs = &new_op->downcall.resp.getattr.attributes;
+
+	type = orangefs_inode_type(attrs->objtype);
+	ret = orangefs_inode_is_stale(inode, new, attrs,
+	    new_op->downcall.resp.getattr.link_target);
+	if (ret) {
+		ret = -ESTALE;
+		goto out;
+	}
+
+	switch (type) {
+	case S_IFREG:
+		inode->i_flags = orangefs_inode_flags(attrs);
+		if (size) {
+			inode_size = (loff_t)attrs->size;
+			rounded_up_size =
+			    (inode_size + (4096 - (inode_size % 4096)));
+			inode->i_size = inode_size;
+			orangefs_inode->blksize = attrs->blksize;
+			spin_lock(&inode->i_lock);
+			inode->i_bytes = inode_size;
+			inode->i_blocks =
+			    (unsigned long)(rounded_up_size / 512);
+			spin_unlock(&inode->i_lock);
+		}
+		break;
+	case S_IFDIR:
+		inode->i_size = PAGE_SIZE;
+		orangefs_inode->blksize = (1 << inode->i_blkbits);
+		spin_lock(&inode->i_lock);
+		inode_set_bytes(inode, inode->i_size);
+		spin_unlock(&inode->i_lock);
+		set_nlink(inode, 1);
+		break;
+	case S_IFLNK:
+		if (new) {
+			inode->i_size = (loff_t)strlen(new_op->
+			    downcall.resp.getattr.link_target);
+			orangefs_inode->blksize = (1 << inode->i_blkbits);
+			ret = strscpy(orangefs_inode->link_target,
+			    new_op->downcall.resp.getattr.link_target,
+			    ORANGEFS_NAME_MAX);
+			if (ret == -E2BIG) {
+				ret = -EIO;
+				goto out;
+			}
+			inode->i_link = orangefs_inode->link_target;
+		}
+		break;
+	default:
+		/*
+		 * Unknown object type. For an existing inode the staleness
+		 * check above has already rejected it; a new inode gets
+		 * here with type == -1, which must not be folded into
+		 * i_mode below. Treat it the same way as the stale case.
+		 */
+		orangefs_make_bad_inode(inode);
+		ret = -ESTALE;
+		goto out;
+	}
+
+	inode->i_uid = make_kuid(&init_user_ns, attrs->owner);
+	inode->i_gid = make_kgid(&init_user_ns, attrs->group);
+	inode->i_atime.tv_sec = (time64_t)attrs->atime;
+	inode->i_mtime.tv_sec = (time64_t)attrs->mtime;
+	inode->i_ctime.tv_sec = (time64_t)attrs->ctime;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+
+	/* special case: mark the root inode as sticky */
+	inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
+	    orangefs_inode_perms(attrs);
+
+	ret = 0;
+out:
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Ask the server for the object type and link target of @inode and report
+ * whether the inode has gone stale.
+ *
+ * Returns the result of orangefs_inode_is_stale() (1 stale, 0 not) when
+ * the upcall succeeds, -ENOMEM if no op could be allocated, or the error
+ * returned by service_operation().
+ */
+int orangefs_inode_check_changed(struct inode *inode)
+{
+	struct orangefs_inode_s *oi = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *op;
+	int status;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
+	    get_khandle_from_ino(inode));
+
+	op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
+	if (!op)
+		return -ENOMEM;
+
+	op->upcall.req.getattr.refn = oi->refn;
+	op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_TYPE |
+	    ORANGEFS_ATTR_SYS_LNK_TARGET;
+
+	status = service_operation(op, __func__,
+	    get_interruptible_flag(inode));
+	if (status == 0)
+		status = orangefs_inode_is_stale(inode, 0,
+		    &op->downcall.resp.getattr.attributes,
+		    op->downcall.resp.getattr.link_target);
+
+	op_release(op);
+	return status;
+}
+
+/*
+ * Issue an orangefs setattr request so that the attribute values in
+ * @iattr take effect on the server. Returns 0 on success and a negative
+ * errno otherwise.
+ */
+int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
+{
+	struct orangefs_inode_s *oi = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *op;
+	int status;
+
+	op = op_alloc(ORANGEFS_VFS_OP_SETATTR);
+	if (!op)
+		return -ENOMEM;
+
+	op->upcall.req.setattr.refn = oi->refn;
+	status = copy_attributes_from_inode(inode,
+					    &op->upcall.req.setattr.attributes,
+					    iattr);
+	if (status < 0) {
+		/* Nothing was sent to the server; just give the op back. */
+		op_release(op);
+		return status;
+	}
+
+	status = service_operation(op, __func__,
+				   get_interruptible_flag(inode));
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "orangefs_inode_setattr: returning %d\n",
+		     status);
+
+	op_release(op);
+
+	/* A successful setattr clears the pending time and mode flags. */
+	if (status == 0) {
+		ClearAtimeFlag(oi);
+		ClearMtimeFlag(oi);
+		ClearCtimeFlag(oi);
+		ClearModeFlag(oi);
+	}
+
+	return status;
+}
+
+/*
+ * Write back the lazily tracked attributes of a dirty inode.
+ *
+ * The pending mtime/ctime/atime/mode flags are sampled and cleared up
+ * front, then turned into an iattr that is sent with
+ * orangefs_inode_setattr(). Times are dictated by the server, so only the
+ * "update this time" bits are sent; for the mode the current i_mode is
+ * sent as well.
+ *
+ * Returns 0 when nothing was pending, otherwise the result of
+ * orangefs_inode_setattr().
+ */
+int orangefs_flush_inode(struct inode *inode)
+{
+	struct orangefs_inode_s *oi = ORANGEFS_I(inode);
+	struct iattr wbattr;
+	int pending_mtime;
+	int pending_ctime;
+	int pending_atime;
+	int pending_mode;
+
+	memset(&wbattr, 0, sizeof(wbattr));
+
+	/*
+	 * Sample and clear the flags before doing anything else, so that
+	 * several processes calling close() at the same time do not all try
+	 * to flush the same inode.
+	 */
+	pending_mtime = MtimeFlag(oi);
+	ClearMtimeFlag(oi);
+	pending_ctime = CtimeFlag(oi);
+	ClearCtimeFlag(oi);
+	pending_atime = AtimeFlag(oi);
+	ClearAtimeFlag(oi);
+	pending_mode = ModeFlag(oi);
+	ClearModeFlag(oi);
+
+	if (pending_mtime)
+		wbattr.ia_valid |= ATTR_MTIME;
+	if (pending_ctime)
+		wbattr.ia_valid |= ATTR_CTIME;
+	if (pending_atime)
+		wbattr.ia_valid |= ATTR_ATIME;
+	if (pending_mode) {
+		wbattr.ia_mode = inode->i_mode;
+		wbattr.ia_valid |= ATTR_MODE;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "*********** orangefs_flush_inode: %pU (ia_valid %d)\n",
+		     get_khandle_from_ino(inode),
+		     wbattr.ia_valid);
+
+	if (wbattr.ia_valid == 0) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "orangefs_flush_inode skipping setattr()\n");
+		return 0;
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "orangefs_flush_inode (%pU) writing mode %o\n",
+		     get_khandle_from_ino(inode),
+		     inode->i_mode);
+
+	return orangefs_inode_setattr(inode, &wbattr);
+}
+
+/*
+ * Send an FS_UMOUNT upcall for the filesystem behind @sb.
+ *
+ * On success the superblock's mount_pending flag is set. Returns 0 on
+ * success, -ENOMEM if no op could be allocated, or the error returned by
+ * service_operation().
+ */
+int orangefs_unmount_sb(struct super_block *sb)
+{
+	struct orangefs_kernel_op_s *new_op;
+	int ret;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "orangefs_unmount_sb called on sb %p\n",
+		     sb);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
+	if (!new_op)
+		return -ENOMEM;
+	new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
+	new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
+
+	/*
+	 * strncpy() does not terminate the destination when the source fills
+	 * it completely, and the name is printed with %s just below, so
+	 * terminate it explicitly.
+	 */
+	strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
+		ORANGEFS_SB(sb)->devname,
+		ORANGEFS_MAX_SERVER_ADDR_LEN);
+	new_op->upcall.req.fs_umount.orangefs_config_server[
+		ORANGEFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "Attempting ORANGEFS Unmount via host %s\n",
+		     new_op->upcall.req.fs_umount.orangefs_config_server);
+
+	ret = service_operation(new_op, "orangefs_fs_umount", 0);
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "orangefs_unmount: got return value of %d\n", ret);
+
+	/*
+	 * On failure there is nothing to update; the error is reported via
+	 * the return value. (Overwriting the local @sb pointer, as was done
+	 * before, never reached the caller.)
+	 */
+	if (!ret)
+		ORANGEFS_SB(sb)->mount_pending = 1;
+
+	op_release(new_op);
+	return ret;
+}
+
+/*
+ * Mark @inode as bad with make_bad_inode(), unless it is the root handle,
+ * which is only logged and otherwise left alone.
+ */
+void orangefs_make_bad_inode(struct inode *inode)
+{
+	if (!is_root_handle(inode)) {
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "*** making bad inode %pU\n",
+			     get_khandle_from_ino(inode));
+		make_bad_inode(inode);
+		return;
+	}
+
+	/*
+	 * If this occurs, the pvfs2-client-core was killed, but we can't
+	 * afford to lose the inode operations and such associated with the
+	 * root handle in any case.
+	 */
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "*** NOT making bad root inode %pU\n",
+		     get_khandle_from_ino(inode));
+}
+
+/*
+ * The following is a very dirty hack that is now a permanent part of the
+ * ORANGEFS protocol. See protocol.h for more error definitions.
+ */
+
+/*
+ * The order matches include/orangefs-types.h in the OrangeFS source.
+ *
+ * orangefs_normalize_to_errno() indexes this table with the error number
+ * that remains once the ORANGEFS error and class bits are masked off, and
+ * returns the negated entry. Entries must therefore never be reordered.
+ */
+static int PINT_errno_mapping[] = {
+ 0, EPERM, ENOENT, EINTR, EIO, ENXIO, EBADF, EAGAIN, ENOMEM,
+ EFAULT, EBUSY, EEXIST, ENODEV, ENOTDIR, EISDIR, EINVAL, EMFILE,
+ EFBIG, ENOSPC, EROFS, EMLINK, EPIPE, EDEADLK, ENAMETOOLONG,
+ ENOLCK, ENOSYS, ENOTEMPTY, ELOOP, EWOULDBLOCK, ENOMSG, EUNATCH,
+ EBADR, EDEADLOCK, ENODATA, ETIME, ENONET, EREMOTE, ECOMM,
+ EPROTO, EBADMSG, EOVERFLOW, ERESTART, EMSGSIZE, EPROTOTYPE,
+ ENOPROTOOPT, EPROTONOSUPPORT, EOPNOTSUPP, EADDRINUSE,
+ EADDRNOTAVAIL, ENETDOWN, ENETUNREACH, ENETRESET, ENOBUFS,
+ ETIMEDOUT, ECONNREFUSED, EHOSTDOWN, EHOSTUNREACH, EALREADY,
+ EACCES, ECONNRESET, ERANGE
+};
+
+/*
+ * Convert an ORANGEFS error value into a negative errno suitable for
+ * return from the kernel.
+ *
+ * 0 stays 0. A positive value is logged and treated as if its sign had
+ * been inverted. Values with the non-errno bit set become -ETIMEDOUT for
+ * a cancellation and -EINVAL otherwise. Values with the error bit set are
+ * looked up in PINT_errno_mapping (-EINVAL when out of range). Anything
+ * else is logged and returned unchanged.
+ *
+ * XXX: This is very bad since error codes from ORANGEFS may not be
+ * suitable for return into userspace.
+ */
+int orangefs_normalize_to_errno(__s32 error_code)
+{
+	__s32 code;
+	__u32 idx;
+
+	/* Success */
+	if (error_code == 0)
+		return 0;
+
+	/*
+	 * This shouldn't ever happen. If it does it should be fixed on the
+	 * server.
+	 */
+	if (error_code > 0) {
+		gossip_err("orangefs: error status receieved.\n");
+		gossip_err("orangefs: assuming error code is inverted.\n");
+		error_code = -error_code;
+	}
+
+	/* All bit tests below work on the positive form of the code. */
+	code = -error_code;
+
+	if (code & ORANGEFS_NON_ERRNO_ERROR_BIT) {
+		/*
+		 * cancellation error codes generally correspond to
+		 * a timeout from the client's perspective
+		 */
+		if ((code &
+		     (ORANGEFS_ERROR_NUMBER_BITS|ORANGEFS_NON_ERRNO_ERROR_BIT|
+		      ORANGEFS_ERROR_BIT)) == ORANGEFS_ECANCEL)
+			return -ETIMEDOUT;
+
+		/* assume a default error code */
+		gossip_err("orangefs: warning: got error code without errno equivalent: %d.\n", error_code);
+		return -EINVAL;
+	}
+
+	/* Convert ORANGEFS encoded errno values into regular errno values. */
+	if (code & ORANGEFS_ERROR_BIT) {
+		idx = code & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS);
+		if (idx < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping))
+			return -PINT_errno_mapping[idx];
+		return -EINVAL;
+	}
+
+	/*
+	 * Only ORANGEFS protocol error codes should ever come here. Otherwise
+	 * there is a bug somewhere.
+	 */
+	gossip_err("orangefs: orangefs_normalize_to_errno: got error code which is not from ORANGEFS.\n");
+	return error_code;
+}
+
+/*
+ * Translate VFS permission bits in @mode (other/group/user rwx, setgid,
+ * setuid) into the corresponding ORANGEFS permission bits. Bits of @mode
+ * that have no entry in the table are ignored.
+ */
+__s32 ORANGEFS_util_translate_mode(int mode)
+{
+	/* Each VFS mode bit paired with its ORANGEFS counterpart. */
+	static const struct {
+		int vfs_bit;
+		int orangefs_bit;
+	} mode_map[] = {
+		{ S_IXOTH, ORANGEFS_O_EXECUTE },
+		{ S_IWOTH, ORANGEFS_O_WRITE },
+		{ S_IROTH, ORANGEFS_O_READ },
+		{ S_IXGRP, ORANGEFS_G_EXECUTE },
+		{ S_IWGRP, ORANGEFS_G_WRITE },
+		{ S_IRGRP, ORANGEFS_G_READ },
+		{ S_IXUSR, ORANGEFS_U_EXECUTE },
+		{ S_IWUSR, ORANGEFS_U_WRITE },
+		{ S_IRUSR, ORANGEFS_U_READ },
+		{ S_ISGID, ORANGEFS_G_SGID },
+		{ S_ISUID, ORANGEFS_U_SUID },
+	};
+	int translated = 0;
+	size_t k;
+
+	for (k = 0; k < sizeof(mode_map) / sizeof(mode_map[0]); k++)
+		if (mode & mode_map[k].vfs_bit)
+			translated |= mode_map[k].orangefs_bit;
+
+	return translated;
+}
+
+/*
+ * After obtaining a string representation of the client's debug
+ * keywords and their associated masks, this function is called to build an
+ * array of these values.
+ *
+ * @debug_array_string holds one "keyword mask1 mask2" entry per
+ * newline-terminated line. The string is modified in place: each newline
+ * is overwritten with a NUL while the entries are parsed.
+ *
+ * Returns the number of elements on success, -EINVAL if the string holds
+ * no entries or fewer newline-terminated entries than were counted, and
+ * -ENOMEM on allocation failure.
+ *
+ * NOTE(review): when a keyword allocation fails, the partially filled
+ * cdm_array is left in place, as before; confirm who is expected to free
+ * it.
+ * NOTE(review): the keyword buffer is sized from the text up to the first
+ * space, while the "%s" conversion reads up to the next whitespace after
+ * skipping leading whitespace; an entry that starts with a space would
+ * not fit. Confirm the client never sends such an entry.
+ */
+int orangefs_prepare_cdm_array(char *debug_array_string)
+{
+	int i;
+	int rc = -EINVAL;
+	char *cds_head = NULL;
+	char *cds_delimiter = NULL;
+	int keyword_len = 0;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	/*
+	 * figure out how many elements the cdm_array needs: one per
+	 * newline, found in a single pass over the string.
+	 */
+	for (cds_head = debug_array_string; *cds_head; cds_head++)
+		if (*cds_head == '\n')
+			cdm_element_count++;
+
+	if (!cdm_element_count) {
+		pr_info("No elements in client debug array string!\n");
+		goto out;
+	}
+
+	/* kcalloc() checks the count * size multiplication for overflow. */
+	cdm_array = kcalloc(cdm_element_count,
+			    sizeof(struct client_debug_mask),
+			    GFP_KERNEL);
+	if (!cdm_array) {
+		pr_info("malloc failed for cdm_array!\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	cds_head = debug_array_string;
+
+	for (i = 0; i < cdm_element_count; i++) {
+		cds_delimiter = strchr(cds_head, '\n');
+		/* Fewer lines than counted: stop instead of writing to NULL. */
+		if (!cds_delimiter)
+			goto out;
+		*cds_delimiter = '\0';
+
+		keyword_len = strcspn(cds_head, " ");
+
+		cdm_array[i].keyword = kzalloc(keyword_len + 1, GFP_KERNEL);
+		if (!cdm_array[i].keyword) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		sscanf(cds_head,
+		       "%s %llx %llx",
+		       cdm_array[i].keyword,
+		       (unsigned long long *)&(cdm_array[i].mask1),
+		       (unsigned long long *)&(cdm_array[i].mask2));
+
+		if (!strcmp(cdm_array[i].keyword, ORANGEFS_VERBOSE))
+			client_verbose_index = i;
+
+		if (!strcmp(cdm_array[i].keyword, ORANGEFS_ALL))
+			client_all_index = i;
+
+		cds_head = cds_delimiter + 1;
+	}
+
+	rc = cdm_element_count;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: rc:%d:\n", __func__, rc);
+
+out:
+
+	return rc;
+
+}
+
+/*
+ * /sys/kernel/debug/orangefs/debug-help can be catted to
+ * see all the available kernel and client debug keywords.
+ *
+ * When the kernel boots, we have no idea what keywords the
+ * client supports, nor their associated masks.
+ *
+ * We pass through this function once at boot and stamp a
+ * boilerplate "we don't know" message for the client in the
+ * debug-help file. We pass through here again when the client
+ * starts and then we can fill out the debug-help file fully.
+ *
+ * The client might be restarted any number of times between
+ * reboots, we only build the debug-help file the first time.
+ *
+ * at_boot: non-zero to emit HELP_STRING_UNINITIALIZED in place of the
+ * client keyword list; zero to build the list from
+ * client_debug_array_string via orangefs_prepare_cdm_array().
+ *
+ * On success debug_help_string points at a newly allocated buffer of
+ * DEBUG_HELP_STRING_SIZE bytes and 0 is returned. Returns -EINVAL when
+ * the client keyword array could not be built or the text would not
+ * fit, -ENOMEM when the buffer cannot be allocated.
+ */
+int orangefs_prepare_debugfs_help_string(int at_boot)
+{
+	int rc = -EINVAL;
+	int i;
+	int byte_count = 0;
+	char *client_title = "Client Debug Keywords:\n";
+	char *kernel_title = "Kernel Debug Keywords:\n";
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	if (at_boot) {
+		byte_count += strlen(HELP_STRING_UNINITIALIZED);
+		client_title = HELP_STRING_UNINITIALIZED;
+	} else {
+		/*
+		 * fill the client keyword/mask array and remember
+		 * how many elements there were.
+		 */
+		cdm_element_count =
+			orangefs_prepare_cdm_array(client_debug_array_string);
+		if (cdm_element_count <= 0)
+			goto out;
+
+		/* Count the bytes destined for debug_help_string. */
+		byte_count += strlen(client_title);
+
+		for (i = 0; i < cdm_element_count; i++) {
+			/* each keyword is written as "\t<keyword>\n" */
+			byte_count += strlen(cdm_array[i].keyword) + 2;
+			if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+				pr_info("%s: overflow 1!\n", __func__);
+				goto out;
+			}
+		}
+
+		gossip_debug(GOSSIP_UTILS_DEBUG,
+			     "%s: cdm_element_count:%d:\n",
+			     __func__,
+			     cdm_element_count);
+	}
+
+	/* the "\n" separating the two sections, then the kernel title */
+	byte_count += 1 + strlen(kernel_title);
+	for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+		/* each keyword is written as "\t<keyword>\n" */
+		byte_count += strlen(s_kmod_keyword_mask_map[i].keyword) + 2;
+		if (byte_count >= DEBUG_HELP_STRING_SIZE) {
+			pr_info("%s: overflow 2!\n", __func__);
+			goto out;
+		}
+	}
+
+	/* build debug_help_string. */
+	debug_help_string = kzalloc(DEBUG_HELP_STRING_SIZE, GFP_KERNEL);
+	if (!debug_help_string) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	strcat(debug_help_string, client_title);
+
+	if (!at_boot) {
+		for (i = 0; i < cdm_element_count; i++) {
+			strcat(debug_help_string, "\t");
+			strcat(debug_help_string, cdm_array[i].keyword);
+			strcat(debug_help_string, "\n");
+		}
+	}
+
+	strcat(debug_help_string, "\n");
+	strcat(debug_help_string, kernel_title);
+
+	for (i = 0; i < num_kmod_keyword_mask_map; i++) {
+		strcat(debug_help_string, "\t");
+		strcat(debug_help_string, s_kmod_keyword_mask_map[i].keyword);
+		strcat(debug_help_string, "\n");
+	}
+
+	rc = 0;
+
+out:
+
+	return rc;
+
+}
+
+/*
+ * Render a debug mask as a comma-separated keyword list.
+ *
+ * kernel = type 0
+ * client = type 1
+ *
+ * The result is written to kernel_debug_string (type 0) or
+ * client_debug_string (type 1); ORANGEFS_MAX_DEBUG_STRING_LEN bytes of
+ * that buffer are cleared first. If the mask matches an amalgam keyword
+ * (see check_amalgam_keyword()) that keyword alone is the result. If no
+ * keyword matches, the result is "none".
+ */
+void debug_mask_to_string(void *mask, int type)
+{
+	char *debug_string;
+	int element_count;
+	int len;
+	int i;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	if (type) {
+		debug_string = client_debug_string;
+		element_count = cdm_element_count;
+	} else {
+		debug_string = kernel_debug_string;
+		element_count = num_kmod_keyword_mask_map;
+	}
+
+	memset(debug_string, 0, ORANGEFS_MAX_DEBUG_STRING_LEN);
+
+	/*
+	 * Some keywords, like "all" or "verbose", are amalgams of
+	 * numerous other keywords. Make a special check for those
+	 * before grinding through the whole mask only to find out
+	 * later...
+	 */
+	if (!check_amalgam_keyword(mask, type)) {
+		/* Build the debug string. */
+		for (i = 0; i < element_count; i++) {
+			if (type)
+				do_c_string(mask, i);
+			else
+				do_k_string(mask, i);
+		}
+
+		len = strlen(debug_string);
+
+		/*
+		 * do_k_string()/do_c_string() append "," after every
+		 * keyword, so drop the final one. On overflow they
+		 * replace the buffer with ORANGEFS_ALL, which carries no
+		 * trailing comma and must not be truncated.
+		 */
+		if (!len)
+			strcpy(debug_string, "none");
+		else if (debug_string[len - 1] == ',')
+			debug_string[len - 1] = '\0';
+	}
+
+	gossip_debug(GOSSIP_UTILS_DEBUG,
+		     "%s: string:%s:\n",
+		     __func__,
+		     debug_string);
+}
+
+/*
+ * Append the keyword at s_kmod_keyword_mask_map[index], followed by ",",
+ * to kernel_debug_string when any of its mask bits is set in the __u64
+ * that k_mask points to. Amalgam keywords are skipped. If the keyword
+ * would not fit, an error is logged and kernel_debug_string is replaced
+ * with ORANGEFS_ALL.
+ */
+void do_k_string(void *k_mask, int index)
+{
+	__u64 *wanted = (__u64 *) k_mask;
+
+	if (keyword_is_amalgam((char *) s_kmod_keyword_mask_map[index].keyword))
+		return;
+
+	if (!(*wanted & s_kmod_keyword_mask_map[index].mask_val))
+		return;
+
+	if ((strlen(kernel_debug_string) +
+	     strlen(s_kmod_keyword_mask_map[index].keyword))
+	    >= ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+		gossip_err("%s: overflow!\n", __func__);
+		strcpy(kernel_debug_string, ORANGEFS_ALL);
+		return;
+	}
+
+	strcat(kernel_debug_string, s_kmod_keyword_mask_map[index].keyword);
+	strcat(kernel_debug_string, ",");
+}
+
+/*
+ * Append cdm_array[index].keyword, followed by ",", to
+ * client_debug_string when the struct client_debug_mask that c_mask
+ * points to shares a bit with that entry in mask1 or mask2. Amalgam
+ * keywords are skipped. If the keyword would not fit, an error is logged
+ * and client_debug_string is replaced with ORANGEFS_ALL.
+ */
+void do_c_string(void *c_mask, int index)
+{
+	struct client_debug_mask *wanted = (struct client_debug_mask *) c_mask;
+
+	if (keyword_is_amalgam(cdm_array[index].keyword))
+		return;
+
+	if (!((wanted->mask1 & cdm_array[index].mask1) ||
+	      (wanted->mask2 & cdm_array[index].mask2)))
+		return;
+
+	if ((strlen(client_debug_string) +
+	     strlen(cdm_array[index].keyword) + 1)
+	    >= ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+		gossip_err("%s: overflow!\n", __func__);
+		strcpy(client_debug_string, ORANGEFS_ALL);
+		return;
+	}
+
+	strcat(client_debug_string, cdm_array[index].keyword);
+	strcat(client_debug_string, ",");
+}
+
+/*
+ * Return 1 when keyword is ORANGEFS_ALL or ORANGEFS_VERBOSE, 0 otherwise.
+ */
+int keyword_is_amalgam(char *keyword)
+{
+	if (strcmp(keyword, ORANGEFS_ALL) == 0)
+		return 1;
+
+	if (strcmp(keyword, ORANGEFS_VERBOSE) == 0)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ *
+ * return 1 if we found an amalgam.
+ *
+ * For the client, mask is compared for equality against the cdm_array
+ * entries at client_all_index and then client_verbose_index. For the
+ * kernel, mask is compared (>=) against the last entry of
+ * s_kmod_keyword_mask_map. On a match the amalgam keyword is copied into
+ * client_debug_string or kernel_debug_string respectively.
+ */
+int check_amalgam_keyword(void *mask, int type)
+{
+	const struct client_debug_mask *c_mask;
+	const struct client_debug_mask *entry;
+
+	if (!type) {
+		const __u64 *k_mask = (__u64 *) mask;
+		int k_all_index = num_kmod_keyword_mask_map - 1;
+
+		if (*k_mask < s_kmod_keyword_mask_map[k_all_index].mask_val)
+			return 0;
+
+		strcpy(kernel_debug_string, ORANGEFS_ALL);
+		return 1;
+	}
+
+	c_mask = (struct client_debug_mask *) mask;
+
+	entry = &cdm_array[client_all_index];
+	if (c_mask->mask1 == entry->mask1 && c_mask->mask2 == entry->mask2) {
+		strcpy(client_debug_string, ORANGEFS_ALL);
+		return 1;
+	}
+
+	entry = &cdm_array[client_verbose_index];
+	if (c_mask->mask1 == entry->mask1 && c_mask->mask2 == entry->mask2) {
+		strcpy(client_debug_string, ORANGEFS_VERBOSE);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * kernel = type 0
+ * client = type 1
+ *
+ * Split a copy of debug_string on "," and OR the mask of every keyword
+ * that matches a table entry into *mask. mask is a __u64 for the kernel,
+ * which is zeroed first, or a struct client_debug_mask for the client,
+ * which is not cleared here. Empty tokens are skipped.
+ */
+void debug_string_to_mask(char *debug_string, void *mask, int type)
+{
+	struct client_debug_mask *c_mask = NULL;
+	__u64 *k_mask = NULL;
+	char *scratch = kstrdup(debug_string, GFP_KERNEL);
+	char *cursor = scratch;
+	char *token;
+	int nr_keywords;
+	int idx;
+
+	gossip_debug(GOSSIP_UTILS_DEBUG, "%s: start\n", __func__);
+
+	if (type) {
+		c_mask = (struct client_debug_mask *)mask;
+		nr_keywords = cdm_element_count;
+	} else {
+		k_mask = (__u64 *)mask;
+		*k_mask = 0;
+		nr_keywords = num_kmod_keyword_mask_map;
+	}
+
+	/* strsep() advances cursor, so scratch keeps the pointer to free. */
+	while ((token = strsep(&cursor, ",")) != NULL) {
+		if (!strlen(token))
+			continue;
+
+		for (idx = 0; idx < nr_keywords; idx++) {
+			if (type)
+				do_c_mask(idx, token, &c_mask);
+			else
+				do_k_mask(idx, token, &k_mask);
+		}
+	}
+
+	kfree(scratch);
+}
+
+/*
+ * If unchecked_keyword equals cdm_array[i].keyword, OR that entry's
+ * mask1 and mask2 into the client mask that *sane_mask points to.
+ */
+void do_c_mask(int i,
+	       char *unchecked_keyword,
+	       struct client_debug_mask **sane_mask)
+{
+	if (strcmp(cdm_array[i].keyword, unchecked_keyword))
+		return;
+
+	(*sane_mask)->mask1 |= cdm_array[i].mask1;
+	(*sane_mask)->mask2 |= cdm_array[i].mask2;
+}
+
+/*
+ * If unchecked_keyword equals s_kmod_keyword_mask_map[i].keyword, OR that
+ * entry's mask_val into the kernel mask that *sane_mask points to.
+ */
+void do_k_mask(int i, char *unchecked_keyword, __u64 **sane_mask)
+{
+	if (strcmp(s_kmod_keyword_mask_map[i].keyword, unchecked_keyword))
+		return;
+
+	**sane_mask |= s_kmod_keyword_mask_map[i].mask_val;
+}
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
new file mode 100644
index 000000000000..1efc6f8a5224
--- /dev/null
+++ b/fs/orangefs/protocol.h
@@ -0,0 +1,455 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/spinlock_types.h>
+#include <linux/slab.h>
+#include <linux/ioctl.h>
+
+extern struct client_debug_mask *cdm_array;
+extern char *debug_help_string;
+extern int help_string_initialized;
+extern struct dentry *debug_dir;
+extern struct dentry *help_file_dentry;
+extern struct dentry *client_debug_dentry;
+extern const struct file_operations debug_help_fops;
+extern int client_all_index;
+extern int client_verbose_index;
+extern int cdm_element_count;
+#define DEBUG_HELP_STRING_SIZE 4096
+#define HELP_STRING_UNINITIALIZED \
+ "Client Debug Keywords are unknown until the first time\n" \
+ "the client is started after boot.\n"
+#define ORANGEFS_KMOD_DEBUG_HELP_FILE "debug-help"
+#define ORANGEFS_KMOD_DEBUG_FILE "kernel-debug"
+#define ORANGEFS_CLIENT_DEBUG_FILE "client-debug"
+#define ORANGEFS_VERBOSE "verbose"
+#define ORANGEFS_ALL "all"
+
+/* pvfs2-config.h ***********************************************************/
+#define ORANGEFS_VERSION_MAJOR 2
+#define ORANGEFS_VERSION_MINOR 9
+#define ORANGEFS_VERSION_SUB 0
+
+/* khandle stuff ***********************************************************/
+
+/*
+ * The 2.9 core will put 64 bit handles in here like this:
+ * 1234 0000 0000 5678
+ * The 3.0 and beyond cores will put 128 bit handles in here like this:
+ * 1234 5678 90AB CDEF
+ * The kernel module will always use the first four bytes and
+ * the last four bytes as an inum.
+ */
+struct orangefs_khandle {
+ unsigned char u[16];
+} __aligned(8);
+
+/*
+ * kernel version of an object ref.
+ */
+struct orangefs_object_kref {
+ struct orangefs_khandle khandle;
+ __s32 fs_id;
+ __s32 __pad1;
+};
+
+/*
+ * Compare two khandles byte by byte, from u[15] down to u[0], i.e.
+ * treating the handle as a little-endian value (highest address is most
+ * significant).
+ *
+ * Returns 1 if kh1 is greater, -1 if kh1 is smaller, 0 if they are equal.
+ */
+static inline int ORANGEFS_khandle_cmp(const struct orangefs_khandle *kh1,
+				   const struct orangefs_khandle *kh2)
+{
+	int pos = 16;
+
+	while (pos-- > 0) {
+		if (kh1->u[pos] != kh2->u[pos])
+			return kh1->u[pos] > kh2->u[pos] ? 1 : -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy a khandle into the buffer p of size bytes.
+ *
+ * When size is at least 16, the 16 handle bytes are copied and the
+ * remaining size - 16 bytes of p are zeroed. A smaller buffer receives
+ * only the first size bytes of the handle, and a non-positive size
+ * writes nothing, so p is never overrun.
+ */
+static inline void ORANGEFS_khandle_to(const struct orangefs_khandle *kh,
+				   void *p, int size)
+{
+	const int handle_len = sizeof(kh->u);
+
+	if (size <= 0)
+		return;
+
+	if (size < handle_len) {
+		memcpy(p, kh->u, size);
+		return;
+	}
+
+	memcpy(p, kh->u, handle_len);
+	memset((char *)p + handle_len, 0, size - handle_len);
+}
+
+/*
+ * Fill a khandle from the buffer p of size bytes.
+ *
+ * At most 16 bytes are read from p. When size is smaller than that, only
+ * size bytes are read and the rest of the handle stays zero.
+ */
+static inline void ORANGEFS_khandle_from(struct orangefs_khandle *kh,
+				     void *p, int size)
+{
+	const int handle_len = sizeof(kh->u);
+
+	memset(kh, 0, 16);
+
+	if (size <= 0)
+		return;
+
+	memcpy(kh->u, p, size < handle_len ? size : handle_len);
+}
+
+/* pvfs2-types.h ************************************************************/
+typedef __u32 ORANGEFS_uid;
+typedef __u32 ORANGEFS_gid;
+typedef __s32 ORANGEFS_fs_id;
+typedef __u32 ORANGEFS_permissions;
+typedef __u64 ORANGEFS_time;
+typedef __s64 ORANGEFS_size;
+typedef __u64 ORANGEFS_flags;
+typedef __u64 ORANGEFS_ds_position;
+typedef __s32 ORANGEFS_error;
+typedef __s64 ORANGEFS_offset;
+
+#define ORANGEFS_SUPER_MAGIC 0x20030528
+
+/*
+ * An ORANGEFS error code is a signed 32-bit integer. Error codes are negative,
+ * but the sign is stripped before decoding.
+ */
+
+/* Bit 31 is not used since it is the sign. */
+
+/*
+ * Bit 30 specifies that this is an ORANGEFS error. An ORANGEFS error is either
+ * an encoded errno value or an ORANGEFS protocol error.
+ */
+#define ORANGEFS_ERROR_BIT (1 << 30)
+
+/*
+ * Bit 29 specifies that this is an ORANGEFS protocol error and not an encoded
+ * errno value.
+ */
+#define ORANGEFS_NON_ERRNO_ERROR_BIT (1 << 29)
+
+/*
+ * Bits 9, 8, and 7 specify the error class, which encodes the section of
+ * server code the error originated in for logging purposes. It is not used
+ * in the kernel except to be masked out.
+ */
+#define ORANGEFS_ERROR_CLASS_BITS 0x380
+
+/* Bits 6 - 0 are reserved for the actual error code. */
+#define ORANGEFS_ERROR_NUMBER_BITS 0x7f
+
+/* Encoded errno values decoded by PINT_errno_mapping in orangefs-utils.c. */
+
+/* Our own ORANGEFS protocol error codes. */
+#define ORANGEFS_ECANCEL (1|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EDEVINIT (2|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EDETAIL (3|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EHOSTNTFD (4|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_EADDRNTFD (5|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ENORECVR (6|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ETRYAGAIN (7|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ENOTPVFS (8|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+#define ORANGEFS_ESECURITY (9|ORANGEFS_NON_ERRNO_ERROR_BIT|ORANGEFS_ERROR_BIT)
+
+/* permission bits */
+#define ORANGEFS_O_EXECUTE (1 << 0)
+#define ORANGEFS_O_WRITE (1 << 1)
+#define ORANGEFS_O_READ (1 << 2)
+#define ORANGEFS_G_EXECUTE (1 << 3)
+#define ORANGEFS_G_WRITE (1 << 4)
+#define ORANGEFS_G_READ (1 << 5)
+#define ORANGEFS_U_EXECUTE (1 << 6)
+#define ORANGEFS_U_WRITE (1 << 7)
+#define ORANGEFS_U_READ (1 << 8)
+/* no ORANGEFS_U_VTX (sticky bit) */
+#define ORANGEFS_G_SGID (1 << 10)
+#define ORANGEFS_U_SUID (1 << 11)
+
+/* definition taken from stdint.h */
+#define INT32_MAX (2147483647)
+#define ORANGEFS_ITERATE_START (INT32_MAX - 1)
+#define ORANGEFS_ITERATE_END (INT32_MAX - 2)
+#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3)
+#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
+#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END
+#define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
+#define ORANGEFS_APPEND_FL FS_APPEND_FL
+#define ORANGEFS_NOATIME_FL FS_NOATIME_FL
+#define ORANGEFS_MIRROR_FL 0x01000000ULL
+#define ORANGEFS_O_EXECUTE (1 << 0)
+#define ORANGEFS_FS_ID_NULL ((__s32)0)
+#define ORANGEFS_ATTR_SYS_UID (1 << 0)
+#define ORANGEFS_ATTR_SYS_GID (1 << 1)
+#define ORANGEFS_ATTR_SYS_PERM (1 << 2)
+#define ORANGEFS_ATTR_SYS_ATIME (1 << 3)
+#define ORANGEFS_ATTR_SYS_CTIME (1 << 4)
+#define ORANGEFS_ATTR_SYS_MTIME (1 << 5)
+#define ORANGEFS_ATTR_SYS_TYPE (1 << 6)
+#define ORANGEFS_ATTR_SYS_ATIME_SET (1 << 7)
+#define ORANGEFS_ATTR_SYS_MTIME_SET (1 << 8)
+#define ORANGEFS_ATTR_SYS_SIZE (1 << 20)
+#define ORANGEFS_ATTR_SYS_LNK_TARGET (1 << 24)
+#define ORANGEFS_ATTR_SYS_DFILE_COUNT (1 << 25)
+#define ORANGEFS_ATTR_SYS_DIRENT_COUNT (1 << 26)
+#define ORANGEFS_ATTR_SYS_BLKSIZE (1 << 28)
+#define ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT (1 << 29)
+#define ORANGEFS_ATTR_SYS_COMMON_ALL \
+ (ORANGEFS_ATTR_SYS_UID | \
+ ORANGEFS_ATTR_SYS_GID | \
+ ORANGEFS_ATTR_SYS_PERM | \
+ ORANGEFS_ATTR_SYS_ATIME | \
+ ORANGEFS_ATTR_SYS_CTIME | \
+ ORANGEFS_ATTR_SYS_MTIME | \
+ ORANGEFS_ATTR_SYS_TYPE)
+
+#define ORANGEFS_ATTR_SYS_ALL_SETABLE \
+(ORANGEFS_ATTR_SYS_COMMON_ALL-ORANGEFS_ATTR_SYS_TYPE)
+
+#define ORANGEFS_ATTR_SYS_ALL_NOHINT \
+ (ORANGEFS_ATTR_SYS_COMMON_ALL | \
+ ORANGEFS_ATTR_SYS_SIZE | \
+ ORANGEFS_ATTR_SYS_LNK_TARGET | \
+ ORANGEFS_ATTR_SYS_DFILE_COUNT | \
+ ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
+ ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
+ ORANGEFS_ATTR_SYS_BLKSIZE)
+
+#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
+ (ORANGEFS_ATTR_SYS_COMMON_ALL | \
+ ORANGEFS_ATTR_SYS_LNK_TARGET | \
+ ORANGEFS_ATTR_SYS_DFILE_COUNT | \
+ ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
+ ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
+ ORANGEFS_ATTR_SYS_BLKSIZE)
+
+#define ORANGEFS_XATTR_REPLACE 0x2
+#define ORANGEFS_XATTR_CREATE 0x1
+#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
+#define ORANGEFS_NAME_MAX 256
+/*
+ * max extended attribute name len as imposed by the VFS and exploited for the
+ * upcall request types.
+ * NOTE: Please retain them as multiples of 8 even if you wish to change them
+ * This is *NECESSARY* for supporting 32 bit user-space binaries on a 64-bit
+ * kernel. Due to the implementation within DBPF, this really needs to be
+ * ORANGEFS_NAME_MAX. It used to merely have the same value; it is defined in
+ * terms of ORANGEFS_NAME_MAX so that it does not break if that value changes.
+ */
+#define ORANGEFS_MAX_XATTR_NAMELEN ORANGEFS_NAME_MAX /* Not the same as
+ * XATTR_NAME_MAX defined
+ * by <linux/xattr.h>
+ */
+#define ORANGEFS_MAX_XATTR_VALUELEN 8192 /* Not the same as XATTR_SIZE_MAX
+ * defined by <linux/xattr.h>
+ */
+#define ORANGEFS_MAX_XATTR_LISTLEN 16 /* Not the same as XATTR_LIST_MAX
+ * defined by <linux/xattr.h>
+ */
+/*
+ * ORANGEFS I/O operation types, used in both system and server interfaces.
+ */
+enum ORANGEFS_io_type {
+ ORANGEFS_IO_READ = 1,
+ ORANGEFS_IO_WRITE = 2
+};
+
+/*
+ * If this enum is modified the server parameters related to the precreate pool
+ * batch and low threshold sizes may need to be modified to reflect this
+ * change.
+ */
+enum orangefs_ds_type {
+ ORANGEFS_TYPE_NONE = 0,
+ ORANGEFS_TYPE_METAFILE = (1 << 0),
+ ORANGEFS_TYPE_DATAFILE = (1 << 1),
+ ORANGEFS_TYPE_DIRECTORY = (1 << 2),
+ ORANGEFS_TYPE_SYMLINK = (1 << 3),
+ ORANGEFS_TYPE_DIRDATA = (1 << 4),
+ ORANGEFS_TYPE_INTERNAL = (1 << 5) /* for the server's private use */
+};
+
+/*
+ * ORANGEFS_certificate simply stores a buffer with the buffer size.
+ * The buffer can be converted to an OpenSSL X509 struct for use.
+ */
+struct ORANGEFS_certificate {
+ __u32 buf_size;
+ unsigned char *buf;
+};
+
+/*
+ * A credential identifies a user and is signed by the client/user
+ * private key.
+ */
+struct ORANGEFS_credential {
+ __u32 userid; /* user id */
+ __u32 num_groups; /* length of group_array */
+ __u32 *group_array; /* groups for which the user is a member */
+ char *issuer; /* alias of the issuing server */
+ __u64 timeout; /* seconds after epoch to time out */
+ __u32 sig_size; /* length of the signature in bytes */
+ unsigned char *signature; /* digital signature */
+ struct ORANGEFS_certificate certificate; /* user certificate buffer */
+};
+#define extra_size_ORANGEFS_credential (ORANGEFS_REQ_LIMIT_GROUPS * \
+ sizeof(__u32) + \
+ ORANGEFS_REQ_LIMIT_ISSUER + \
+ ORANGEFS_REQ_LIMIT_SIGNATURE + \
+ extra_size_ORANGEFS_certificate)
+
+/* This structure is used by the VFS-client interaction alone */
+struct ORANGEFS_keyval_pair {
+ char key[ORANGEFS_MAX_XATTR_NAMELEN];
+ __s32 key_sz; /* __s32 for portable, fixed-size structures */
+ __s32 val_sz;
+ char val[ORANGEFS_MAX_XATTR_VALUELEN];
+};
+
+/* pvfs2-sysint.h ***********************************************************/
+/* Describes attributes for a file, directory, or symlink. */
+struct ORANGEFS_sys_attr_s {
+ __u32 owner;
+ __u32 group;
+ __u32 perms;
+ __u64 atime;
+ __u64 mtime;
+ __u64 ctime;
+ __s64 size;
+
+ /* NOTE: caller must free if valid */
+ char *link_target;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 dfile_count;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 distr_dir_servers_initial;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 distr_dir_servers_max;
+
+ /* Changed to __s32 so that size of structure does not change */
+ __s32 distr_dir_split_size;
+
+ __u32 mirror_copies_count;
+
+ /* NOTE: caller must free if valid */
+ char *dist_name;
+
+ /* NOTE: caller must free if valid */
+ char *dist_params;
+
+ __s64 dirent_count;
+ enum orangefs_ds_type objtype;
+ __u64 flags;
+ __u32 mask;
+ __s64 blksize;
+};
+
+#define ORANGEFS_LOOKUP_LINK_NO_FOLLOW 0
+
+/* pint-dev.h ***************************************************************/
+
+/* parameter structure used in ORANGEFS_DEV_DEBUG ioctl command */
+struct dev_mask_info_s {
+ enum {
+ KERNEL_MASK,
+ CLIENT_MASK,
+ } mask_type;
+ __u64 mask_value;
+};
+
+struct dev_mask2_info_s {
+ __u64 mask1_value;
+ __u64 mask2_value;
+};
+
+/* pvfs2-util.h *************************************************************/
+__s32 ORANGEFS_util_translate_mode(int mode);
+
+/* pvfs2-debug.h ************************************************************/
+#include "orangefs-debug.h"
+
+/* pvfs2-internal.h *********************************************************/
+/*
+ * Casts for printing 64-bit values with %llu / %lld. The expansion is
+ * fully parenthesized so it behaves as a single operand wherever it is
+ * used.
+ */
+#define llu(x) ((unsigned long long)(x))
+#define lld(x) ((long long)(x))
+
+/* pint-dev-shared.h ********************************************************/
+#define ORANGEFS_DEV_MAGIC 'k'
+
+#define ORANGEFS_READDIR_DEFAULT_DESC_COUNT 5
+
+#define DEV_GET_MAGIC 0x1
+#define DEV_GET_MAX_UPSIZE 0x2
+#define DEV_GET_MAX_DOWNSIZE 0x3
+#define DEV_MAP 0x4
+#define DEV_REMOUNT_ALL 0x5
+#define DEV_DEBUG 0x6
+#define DEV_UPSTREAM 0x7
+#define DEV_CLIENT_MASK 0x8
+#define DEV_CLIENT_STRING 0x9
+#define DEV_MAX_NR 0xa
+
+/* supported ioctls, codes are with respect to user-space */
+enum {
+ ORANGEFS_DEV_GET_MAGIC = _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAGIC, __s32),
+ ORANGEFS_DEV_GET_MAX_UPSIZE =
+ _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_UPSIZE, __s32),
+ ORANGEFS_DEV_GET_MAX_DOWNSIZE =
+ _IOW(ORANGEFS_DEV_MAGIC, DEV_GET_MAX_DOWNSIZE, __s32),
+ ORANGEFS_DEV_MAP = _IO(ORANGEFS_DEV_MAGIC, DEV_MAP),
+ ORANGEFS_DEV_REMOUNT_ALL = _IO(ORANGEFS_DEV_MAGIC, DEV_REMOUNT_ALL),
+ ORANGEFS_DEV_DEBUG = _IOR(ORANGEFS_DEV_MAGIC, DEV_DEBUG, __s32),
+ ORANGEFS_DEV_UPSTREAM = _IOW(ORANGEFS_DEV_MAGIC, DEV_UPSTREAM, int),
+ ORANGEFS_DEV_CLIENT_MASK = _IOW(ORANGEFS_DEV_MAGIC,
+ DEV_CLIENT_MASK,
+ struct dev_mask2_info_s),
+ ORANGEFS_DEV_CLIENT_STRING = _IOW(ORANGEFS_DEV_MAGIC,
+ DEV_CLIENT_STRING,
+ char *),
+ ORANGEFS_DEV_MAXNR = DEV_MAX_NR,
+};
+
+/*
+ * version number for use in communicating between kernel space and user
+ * space. Zero signifies the upstream version of the kernel module.
+ */
+#define ORANGEFS_KERNEL_PROTO_VERSION 0
+#define ORANGEFS_MINIMUM_USERSPACE_VERSION 20903
+
+/*
+ * describes memory regions to map in the ORANGEFS_DEV_MAP ioctl.
+ * NOTE: See devorangefs-req.c for 32 bit compat structure.
+ * Since this structure has a variable-sized layout that is different
+ * on 32 and 64 bit platforms, we need to normalize to a 64 bit layout
+ * on such systems before servicing ioctl calls from user-space binaries
+ * that may be 32 bit!
+ */
+struct ORANGEFS_dev_map_desc {
+ void *ptr;
+ __s32 total_size;
+ __s32 size;
+ __s32 count;
+};
+
+/* gossip.h *****************************************************************/
+
+#ifdef GOSSIP_DISABLE_DEBUG
+#define gossip_debug(mask, fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
+#else
+extern __u64 gossip_debug_mask;
+extern struct client_debug_mask client_debug_mask;
+
+/* try to avoid function call overhead by checking masks in macro */
+#define gossip_debug(mask, fmt, ...) \
+do { \
+ if (gossip_debug_mask & (mask)) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+} while (0)
+#endif /* GOSSIP_DISABLE_DEBUG */
+
+/*
+ * gossip_ldebug prefixes the message with the calling function's name;
+ * gossip_lerr prefixes it with the file name and line number.
+ */
+#define gossip_ldebug(mask, fmt, ...) \
+ gossip_debug(mask, "%s: " fmt, __func__, ##__VA_ARGS__)
+
+#define gossip_err pr_err
+#define gossip_lerr(fmt, ...) \
+ gossip_err("%s line %d: " fmt, \
+ __FILE__, __LINE__, ##__VA_ARGS__)
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
new file mode 100644
index 000000000000..b9da9a0281c9
--- /dev/null
+++ b/fs/orangefs/super.c
@@ -0,0 +1,559 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+#include <linux/parser.h>
+
+/* a cache for orangefs-inode objects (i.e. orangefs inode private data) */
+static struct kmem_cache *orangefs_inode_cache;
+
+/* list for storing orangefs specific superblocks in use */
+LIST_HEAD(orangefs_superblocks);
+
+DEFINE_SPINLOCK(orangefs_superblocks_lock);
+
+enum {
+ Opt_intr,
+ Opt_acl,
+ Opt_local_lock,
+
+ Opt_err
+};
+
+static const match_table_t tokens = {
+ { Opt_acl, "acl" },
+ { Opt_intr, "intr" },
+ { Opt_local_lock, "local_lock" },
+ { Opt_err, NULL }
+};
+
+
+/*
+ * Parse a comma-separated mount option string and apply it to sb.
+ *
+ * MS_POSIXACL in sb->s_flags and the ORANGEFS_OPT_INTR and
+ * ORANGEFS_OPT_LOCAL_LOCK bits in the orangefs sb info are cleared first,
+ * then set again for each "acl", "intr" and "local_lock" option found.
+ * Empty options are skipped. options is consumed by strsep().
+ *
+ * Returns 0 on success, or -EINVAL at the first unsupported option,
+ * which is logged unless silent is set.
+ */
+static int parse_mount_options(struct super_block *sb, char *options,
+				int silent)
+{
+	struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
+	substring_t args[MAX_OPT_ARGS];
+	char *opt;
+
+	/*
+	 * Force any potential flags that might be set from the mount
+	 * to zero, ie, initialize to unset.
+	 */
+	sb->s_flags &= ~MS_POSIXACL;
+	orangefs_sb->flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK);
+
+	for (opt = strsep(&options, ",");
+	     opt != NULL;
+	     opt = strsep(&options, ",")) {
+		if (!*opt)
+			continue;
+
+		switch (match_token(opt, tokens, args)) {
+		case Opt_acl:
+			sb->s_flags |= MS_POSIXACL;
+			break;
+		case Opt_intr:
+			orangefs_sb->flags |= ORANGEFS_OPT_INTR;
+			break;
+		case Opt_local_lock:
+			orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
+			break;
+		default:
+			if (!silent)
+				gossip_err("Error: mount option [%s] is not supported.\n", opt);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Constructor for orangefs inode objects: initialises the xattr_sem
+ * rw-semaphore and the embedded VFS inode, then sets the VFS inode's
+ * i_version to 1.
+ */
+static void orangefs_inode_cache_ctor(void *req)
+{
+	struct orangefs_inode_s *oi = req;
+
+	init_rwsem(&oi->xattr_sem);
+
+	/* i_version is assigned only after inode_init_once() has run. */
+	inode_init_once(&oi->vfs_inode);
+	oi->vfs_inode.i_version = 1;
+}
+
+/*
+ * Allocate an orangefs inode from orangefs_inode_cache and reset its
+ * orangefs-specific fields. Returns the embedded VFS inode, or NULL if
+ * the allocation fails.
+ */
+static struct inode *orangefs_alloc_inode(struct super_block *sb)
+{
+	struct orangefs_inode_s *oi =
+		kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL);
+
+	if (!oi) {
+		gossip_err("Failed to allocate orangefs_inode\n");
+		return NULL;
+	}
+
+	/*
+	 * We want to clear everything except for rw_semaphore and the
+	 * vfs_inode.
+	 */
+	oi->pinode_flags = 0;
+	oi->last_failed_block_index_read = 0;
+	oi->refn.fs_id = ORANGEFS_FS_ID_NULL;
+	memset(&oi->refn.khandle, 0, 16);
+	memset(oi->link_target, 0, sizeof(oi->link_target));
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "orangefs_alloc_inode: allocated %p\n",
+		     &oi->vfs_inode);
+	return &oi->vfs_inode;
+}
+
+/*
+ * Log the inode being destroyed and return its orangefs inode to
+ * orangefs_inode_cache.
+ */
+static void orangefs_destroy_inode(struct inode *inode)
+{
+	struct orangefs_inode_s *oi = ORANGEFS_I(inode);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "%s: deallocated %p destroying inode %pU\n",
+		     __func__,
+		     oi,
+		     get_khandle_from_ino(inode));
+
+	kmem_cache_free(orangefs_inode_cache, oi);
+}
+
+/*
+ * NOTE: information filled in here is typically reflected in the
+ * output of the system command 'df'
+ *
+ * Issues an ORANGEFS_VFS_OP_STATFS operation for the superblock's fs_id
+ * (interruptible when the ORANGEFS_OPT_INTR flag is set) and, unless the
+ * downcall status is negative, copies the response into buf.
+ *
+ * Returns -ENOMEM if the operation cannot be allocated, otherwise the
+ * result of service_operation().
+ */
+static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct orangefs_kernel_op_s *new_op;
+	int flags = 0;
+	int ret;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "orangefs_statfs: called on sb %p (fs_id is %d)\n",
+		     sb,
+		     (int)(ORANGEFS_SB(sb)->fs_id));
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_STATFS);
+	if (!new_op)
+		return -ENOMEM;
+	new_op->upcall.req.statfs.fs_id = ORANGEFS_SB(sb)->fs_id;
+
+	if (ORANGEFS_SB(sb)->flags & ORANGEFS_OPT_INTR)
+		flags = ORANGEFS_OP_INTERRUPTIBLE;
+
+	ret = service_operation(new_op, "orangefs_statfs", flags);
+
+	/* buf is only filled in when the downcall status is not negative. */
+	if (new_op->downcall.status >= 0) {
+		gossip_debug(GOSSIP_SUPER_DEBUG,
+			     "%s: got %ld blocks available | "
+			     "%ld blocks total | %ld block size | "
+			     "%ld files total | %ld files avail\n",
+			     __func__,
+			     (long)new_op->downcall.resp.statfs.blocks_avail,
+			     (long)new_op->downcall.resp.statfs.blocks_total,
+			     (long)new_op->downcall.resp.statfs.block_size,
+			     (long)new_op->downcall.resp.statfs.files_total,
+			     (long)new_op->downcall.resp.statfs.files_avail);
+
+		buf->f_type = sb->s_magic;
+		memcpy(&buf->f_fsid,
+		       &ORANGEFS_SB(sb)->fs_id,
+		       sizeof(buf->f_fsid));
+		buf->f_bsize = new_op->downcall.resp.statfs.block_size;
+		buf->f_namelen = ORANGEFS_NAME_MAX;
+
+		buf->f_blocks =
+			(sector_t) new_op->downcall.resp.statfs.blocks_total;
+		buf->f_bfree =
+			(sector_t) new_op->downcall.resp.statfs.blocks_avail;
+		buf->f_bavail =
+			(sector_t) new_op->downcall.resp.statfs.blocks_avail;
+		buf->f_files =
+			(sector_t) new_op->downcall.resp.statfs.files_total;
+		buf->f_ffree =
+			(sector_t) new_op->downcall.resp.statfs.files_avail;
+		buf->f_frsize = sb->s_blocksize;
+	}
+
+	op_release(new_op);
+	gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_statfs: returning %d\n", ret);
+	return ret;
+}
+
+/*
+ * Remount as initiated by VFS layer. We just need to reparse the mount
+ * options, no need to signal pvfs2-client-core about it.
+ *
+ * The flags argument is not used. parse_mount_options() is called with
+ * silent set to 1, so an unsupported option is rejected without a log
+ * message. Returns the result of parse_mount_options().
+ */
+static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n");
+ return parse_mount_options(sb, data, 1);
+}
+
+/*
+ * Remount as initiated by pvfs2-client-core on restart. This is used to
+ * repopulate mount information left from previous pvfs2-client-core.
+ *
+ * the idea here is that given a valid superblock, we're
+ * re-initializing the user space client with the initial mount
+ * information specified when the super block was first initialized.
+ * this is very different than the first initialization/creation of a
+ * superblock. we use the special service_priority_operation to make
+ * sure that the mount gets ahead of any other pending operation that
+ * is waiting for servicing. this means that the pvfs2-client won't
+ * fail to start several times for all other pending operations before
+ * the client regains all of the mount information from us.
+ * NOTE: this function assumes that the request_mutex is already acquired!
+ *
+ * Returns -ENOMEM if the operation cannot be allocated, otherwise the
+ * result of service_operation(). On success (0) the id from the
+ * response is stored in orangefs_sb->id and mount_pending is cleared.
+ */
+int orangefs_remount(struct orangefs_sb_info_s *orangefs_sb)
+{
+	struct orangefs_kernel_op_s *new_op;
+	int ret = -EINVAL;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount: called\n");
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
+	if (!new_op)
+		return -ENOMEM;
+
+	/*
+	 * strncpy() leaves the destination unterminated when the source
+	 * fills it, and the name is printed with %s just below, so copy
+	 * one byte less and terminate explicitly.
+	 */
+	strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
+		orangefs_sb->devname,
+		ORANGEFS_MAX_SERVER_ADDR_LEN - 1);
+	new_op->upcall.req.fs_mount.orangefs_config_server[
+		ORANGEFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Attempting ORANGEFS Remount via host %s\n",
+		     new_op->upcall.req.fs_mount.orangefs_config_server);
+
+	/*
+	 * we assume that the calling function has already acquired the
+	 * request_mutex to prevent other operations from bypassing
+	 * this one
+	 */
+	ret = service_operation(new_op, "orangefs_remount",
+		ORANGEFS_OP_PRIORITY | ORANGEFS_OP_NO_MUTEX);
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "orangefs_remount: mount got return value of %d\n",
+		     ret);
+	if (ret == 0) {
+		/*
+		 * store the id assigned to this sb -- it's just a
+		 * short-lived mapping that the system interface uses
+		 * to map this superblock to a particular mount entry
+		 */
+		orangefs_sb->id = new_op->downcall.resp.fs_mount.id;
+		orangefs_sb->mount_pending = 0;
+	}
+
+	op_release(new_op);
+	return ret;
+}
+
+/* Placeholder: performs no work and always returns 0. */
+int fsid_key_table_initialize(void)
+{
+ return 0;
+}
+
+/* Placeholder: performs no work. */
+void fsid_key_table_finalize(void)
+{
+}
+
+/*
+ * Called whenever the VFS dirties the inode in response to atime updates.
+ *
+ * Logs the inode's khandle and calls SetAtimeFlag() on the orangefs
+ * inode. The flags argument is not used.
+ */
+static void orangefs_dirty_inode(struct inode *inode, int flags)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "orangefs_dirty_inode: %pU\n",
+ get_khandle_from_ino(inode));
+ SetAtimeFlag(orangefs_inode);
+}
+
+/*
+ * Superblock operations for orangefs; installed as sb->s_op by
+ * orangefs_fill_sb().
+ */
+static const struct super_operations orangefs_s_ops = {
+ .alloc_inode = orangefs_alloc_inode,
+ .destroy_inode = orangefs_destroy_inode,
+ .dirty_inode = orangefs_dirty_inode,
+ .drop_inode = generic_delete_inode,
+ .statfs = orangefs_statfs,
+ .remount_fs = orangefs_remount_fs,
+ .show_options = generic_show_options,
+};
+
+/*
+ * Export operation: turn a file handle back into a dentry.
+ *
+ * The object reference is rebuilt from the handle the way
+ * orangefs_encode_fh() laid it out: 16 bytes of khandle starting at
+ * fid->raw, with the fs_id in fid->raw[4]. Returns NULL when fh_len is
+ * below 5 or fh_type is above 2, otherwise the result of
+ * d_obtain_alias() on the inode from orangefs_iget().
+ */
+static struct dentry *orangefs_fh_to_dentry(struct super_block *sb,
+					     struct fid *fid,
+					     int fh_len,
+					     int fh_type)
+{
+	struct orangefs_object_kref refn;
+
+	if (fh_len < 5)
+		return NULL;
+	if (fh_type > 2)
+		return NULL;
+
+	ORANGEFS_khandle_from(&(refn.khandle), fid->raw, 16);
+	refn.fs_id = (u32) fid->raw[4];
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "fh_to_dentry: handle %pU, fs_id %d\n",
+		     &refn.khandle,
+		     refn.fs_id);
+
+	return d_obtain_alias(orangefs_iget(sb, &refn));
+}
+
+/*
+ * Export operation: encode an inode (and optionally its parent) into fh.
+ *
+ * Layout in 32-bit words: fh[0..3] hold the inode's khandle and fh[4]
+ * its fs_id. With a parent, fh[5..8] hold the parent's khandle and fh[9]
+ * its fs_id. *max_len is in 32-bit words and is updated to the length
+ * needed (5, or 10 with a parent).
+ *
+ * Returns 1 for an inode-only handle, 2 when the parent is included, or
+ * 255 when the supplied buffer is too small.
+ */
+static int orangefs_encode_fh(struct inode *inode,
+			      __u32 *fh,
+			      int *max_len,
+			      struct inode *parent)
+{
+	const int needed = parent ? 10 : 5;
+	struct orangefs_object_kref refn;
+
+	if (*max_len < needed) {
+		gossip_lerr("fh buffer is too small for encoding\n");
+		*max_len = needed;
+		return 255;
+	}
+
+	refn = ORANGEFS_I(inode)->refn;
+	ORANGEFS_khandle_to(&refn.khandle, fh, 16);
+	fh[4] = refn.fs_id;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Encoding fh: handle %pU, fsid %u\n",
+		     &refn.khandle,
+		     refn.fs_id);
+
+	*max_len = needed;
+
+	if (!parent)
+		return 1;
+
+	refn = ORANGEFS_I(parent)->refn;
+	ORANGEFS_khandle_to(&refn.khandle, (char *) fh + 20, 16);
+	fh[9] = refn.fs_id;
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Encoding parent: handle %pU, fsid %u\n",
+		     &refn.khandle,
+		     refn.fs_id);
+
+	return 2;
+}
+
+/*
+ * Export operations for orangefs; installed as sb->s_export_op by
+ * orangefs_fill_sb().
+ */
+static const struct export_operations orangefs_export_ops = {
+ .encode_fh = orangefs_encode_fh,
+ .fh_to_dentry = orangefs_fh_to_dentry,
+};
+
+/*
+ * Initialise a new superblock from a mount response.
+ *
+ * Allocates the private orangefs sb info, copies root_khandle, fs_id and
+ * id from fs_mount, parses the mount options in data (when non-NULL),
+ * installs the xattr handlers, super, dentry and export operations, and
+ * creates the root inode and root dentry.
+ *
+ * Returns 0 on success, -ENOMEM if the sb info or root dentry cannot be
+ * allocated, the error from parse_mount_options(), or the error encoded
+ * in the pointer returned by orangefs_iget().
+ *
+ * NOTE(review): the error returns after the kzalloc() leave
+ * sb->s_fs_info allocated; presumably the caller's teardown frees it --
+ * confirm.
+ */
+static int orangefs_fill_sb(struct super_block *sb,
+ struct orangefs_fs_mount_response *fs_mount,
+ void *data, int silent)
+{
+ int ret = -EINVAL;
+ struct inode *root = NULL;
+ struct dentry *root_dentry = NULL;
+ struct orangefs_object_kref root_object;
+
+ /* alloc and init our private orangefs sb info */
+ sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
+ if (!ORANGEFS_SB(sb))
+ return -ENOMEM;
+ ORANGEFS_SB(sb)->sb = sb;
+
+ ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle;
+ ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id;
+ ORANGEFS_SB(sb)->id = fs_mount->id;
+
+ /* mount options are optional; without them the flags stay as allocated */
+ if (data) {
+ ret = parse_mount_options(sb, data, silent);
+ if (ret)
+ return ret;
+ }
+
+ /* Hang the xattr handlers off the superblock */
+ sb->s_xattr = orangefs_xattr_handlers;
+ sb->s_magic = ORANGEFS_SUPER_MAGIC;
+ sb->s_op = &orangefs_s_ops;
+ sb->s_d_op = &orangefs_dentry_operations;
+
+ /* block size and shift are taken from the orangefs bufmap queries */
+ sb->s_blocksize = orangefs_bufmap_size_query();
+ sb->s_blocksize_bits = orangefs_bufmap_shift_query();
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+ /* the root object is identified by the root khandle and fs_id */
+ root_object.khandle = ORANGEFS_SB(sb)->root_khandle;
+ root_object.fs_id = ORANGEFS_SB(sb)->fs_id;
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "get inode %pU, fsid %d\n",
+ &root_object.khandle,
+ root_object.fs_id);
+
+ root = orangefs_iget(sb, &root_object);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG,
+ "Allocated root inode [%p] with mode %x\n",
+ root,
+ root->i_mode);
+
+ /* allocates and places root dentry in dcache */
+ root_dentry = d_make_root(root);
+ if (!root_dentry)
+ return -ENOMEM;
+
+ sb->s_export_op = &orangefs_export_ops;
+ sb->s_root = root_dentry;
+ return 0;
+}
+
+/*
+ * Mount entry point for orangefs.
+ *
+ * Sends an ORANGEFS_VFS_OP_FS_MOUNT upcall naming @devname as the config
+ * server, builds a superblock from the reply via orangefs_fill_sb() and
+ * links its private info onto orangefs_superblocks.
+ *
+ * @fst:     filesystem type being mounted
+ * @flags:   mount flags; MS_SILENT is forwarded to option parsing
+ * @devname: server address string; must not be NULL
+ * @data:    mount option data, may be NULL
+ *
+ * Returns a referenced root dentry on success or an ERR_PTR() on failure.
+ */
+struct dentry *orangefs_mount(struct file_system_type *fst,
+		int flags,
+		const char *devname,
+		void *data)
+{
+	int ret = -EINVAL;
+	struct super_block *sb = ERR_PTR(-EINVAL);
+	struct orangefs_kernel_op_s *new_op;
+	struct dentry *d = ERR_PTR(-EINVAL);
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "orangefs_mount: called with devname %s\n",
+		     devname);
+
+	if (!devname) {
+		gossip_err("ERROR: device name not specified.\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
+	if (!new_op)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * strncpy() does not NUL-terminate when the source fills the whole
+	 * buffer, and the copy is printed with %s just below, so leave room
+	 * for the terminator and write it explicitly.
+	 */
+	strncpy(new_op->upcall.req.fs_mount.orangefs_config_server,
+		devname,
+		ORANGEFS_MAX_SERVER_ADDR_LEN - 1);
+	new_op->upcall.req.fs_mount.orangefs_config_server[
+		ORANGEFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Attempting ORANGEFS Mount via host %s\n",
+		     new_op->upcall.req.fs_mount.orangefs_config_server);
+
+	ret = service_operation(new_op, "orangefs_mount", 0);
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "orangefs_mount: mount got return value of %d\n", ret);
+	if (ret)
+		goto free_op;
+
+	if (new_op->downcall.resp.fs_mount.fs_id == ORANGEFS_FS_ID_NULL) {
+		gossip_err("ERROR: Retrieved null fs_id\n");
+		ret = -EINVAL;
+		goto free_op;
+	}
+
+	sb = sget(fst, NULL, set_anon_super, flags, NULL);
+
+	if (IS_ERR(sb)) {
+		d = ERR_CAST(sb);
+		/* keep the failure message below from reporting "0" */
+		ret = PTR_ERR(sb);
+		goto free_op;
+	}
+
+	ret = orangefs_fill_sb(sb,
+			       &new_op->downcall.resp.fs_mount, data,
+			       flags & MS_SILENT ? 1 : 0);
+
+	if (ret) {
+		/*
+		 * NOTE(review): sb came from sget() and is not released on
+		 * this path, which looks like a superblock leak.  Releasing
+		 * it here would run orangefs_kill_sb() on an sb that was
+		 * never added to orangefs_superblocks, so that function
+		 * needs to cope with that first -- confirm and fix together.
+		 */
+		d = ERR_PTR(ret);
+		goto free_op;
+	}
+
+	/*
+	 * on successful mount, store the devname and data
+	 * used; same termination rule as for the upcall copy above
+	 */
+	strncpy(ORANGEFS_SB(sb)->devname,
+		devname,
+		ORANGEFS_MAX_SERVER_ADDR_LEN - 1);
+	ORANGEFS_SB(sb)->devname[ORANGEFS_MAX_SERVER_ADDR_LEN - 1] = '\0';
+
+	/* mount_pending must be cleared */
+	ORANGEFS_SB(sb)->mount_pending = 0;
+
+	/*
+	 * finally, add this sb to our list of known orangefs
+	 * sb's
+	 */
+	gossip_debug(GOSSIP_SUPER_DEBUG,
+		     "Adding SB %p to orangefs superblocks\n",
+		     ORANGEFS_SB(sb));
+	spin_lock(&orangefs_superblocks_lock);
+	list_add_tail(&ORANGEFS_SB(sb)->list, &orangefs_superblocks);
+	spin_unlock(&orangefs_superblocks_lock);
+	op_release(new_op);
+	return dget(sb->s_root);
+
+free_op:
+	gossip_err("orangefs_mount: mount request failed with %d\n", ret);
+	if (ret == -EINVAL) {
+		gossip_err("Ensure that all orangefs-servers have the same FS configuration files\n");
+		gossip_err("Look at pvfs2-client-core log file (typically /tmp/pvfs2-client.log) for more details\n");
+	}
+
+	op_release(new_op);
+
+	return d;
+}
+
+void orangefs_kill_sb(struct super_block *sb)
+{
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
+
+ /*
+ * Teardown of an orangefs superblock, in this order: generic cleanup,
+ * unmount upcall to userspace, removal from orangefs_superblocks, a
+ * lock/unlock of request_mutex as a barrier, then freeing of the
+ * private data.  This first step is the provided sb cleanup.
+ */
+ kill_anon_super(sb);
+
+ /*
+ * issue the unmount to userspace to tell it to remove the
+ * dynamic mount info it has for this superblock
+ */
+ orangefs_unmount_sb(sb);
+
+ /*
+ * remove the sb from our list of orangefs specific sb's; the list
+ * is only touched under orangefs_superblocks_lock
+ */
+
+ spin_lock(&orangefs_superblocks_lock);
+ __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init: prev is set to NULL explicitly on the next line */
+ ORANGEFS_SB(sb)->list.prev = NULL;
+ spin_unlock(&orangefs_superblocks_lock);
+
+ /*
+ * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us
+ * gets completed before we free the dang thing.  Nothing is done while
+ * the mutex is held: acquiring it only waits for the current holder.
+ */
+ mutex_lock(&request_mutex);
+ mutex_unlock(&request_mutex);
+
+ /* free the orangefs superblock private data (sb->s_fs_info, allocated in orangefs_fill_sb) */
+ kfree(ORANGEFS_SB(sb));
+}
+
+/*
+ * Create the slab cache used for struct orangefs_inode_s objects.
+ * Returns 0 on success or -ENOMEM when the cache cannot be created.
+ */
+int orangefs_inode_cache_initialize(void)
+{
+	orangefs_inode_cache = kmem_cache_create("orangefs_inode_cache",
+						 sizeof(struct orangefs_inode_s),
+						 0,
+						 ORANGEFS_CACHE_CREATE_FLAGS,
+						 orangefs_inode_cache_ctor);
+	if (orangefs_inode_cache)
+		return 0;
+
+	gossip_err("Cannot create orangefs_inode_cache\n");
+	return -ENOMEM;
+}
+
+/* Destroy the orangefs inode slab cache.  Always returns 0. */
+int orangefs_inode_cache_finalize(void)
+{
+	kmem_cache_destroy(orangefs_inode_cache);
+	return 0;
+}
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
new file mode 100644
index 000000000000..6418dd638680
--- /dev/null
+++ b/fs/orangefs/symlink.c
@@ -0,0 +1,19 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+/* Inode operations for orangefs symlinks. */
+struct inode_operations orangefs_symlink_inode_operations = {
+	/* link resolution uses the generic helpers */
+	.get_link = simple_get_link,
+	.readlink = generic_readlink,
+	/* attributes and permissions */
+	.getattr = orangefs_getattr,
+	.setattr = orangefs_setattr,
+	.permission = orangefs_permission,
+	/* extended attributes */
+	.setxattr = generic_setxattr,
+	.listxattr = orangefs_listxattr,
+};
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
new file mode 100644
index 000000000000..001b20239407
--- /dev/null
+++ b/fs/orangefs/upcall.h
@@ -0,0 +1,246 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+#ifndef __UPCALL_H
+#define __UPCALL_H
+
+/*
+ * Sanitized this header file to fix
+ * 32-64 bit interaction issues between
+ * client-core and device
+ *
+ * Each orangefs_*_request_s structure below is the payload for one kind
+ * of upcall and appears as a member of the "req" union at the end of
+ * struct orangefs_upcall_s.
+ *
+ * NOTE(review): the __padN members presumably exist to keep the layout
+ * identical for 32-bit and 64-bit builds; confirm against client-core
+ * before removing or reordering any field.
+ */
+struct orangefs_io_request_s {
+ __s32 __pad1;
+ __s32 buf_index;
+ __s32 count;
+ __s32 __pad2;
+ __s64 offset;
+ struct orangefs_object_kref refn;
+ enum ORANGEFS_io_type io_type;
+ __s32 readahead_size;
+};
+
+struct orangefs_lookup_request_s {
+ __s32 sym_follow;
+ __s32 __pad1;
+ struct orangefs_object_kref parent_refn;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_create_request_s {
+ struct orangefs_object_kref parent_refn;
+ struct ORANGEFS_sys_attr_s attributes;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_symlink_request_s {
+ struct orangefs_object_kref parent_refn;
+ struct ORANGEFS_sys_attr_s attributes;
+ char entry_name[ORANGEFS_NAME_MAX];
+ char target[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_getattr_request_s {
+ struct orangefs_object_kref refn;
+ __u32 mask;
+ __u32 __pad1;
+};
+
+struct orangefs_setattr_request_s {
+ struct orangefs_object_kref refn;
+ struct ORANGEFS_sys_attr_s attributes;
+};
+
+struct orangefs_remove_request_s {
+ struct orangefs_object_kref parent_refn;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_mkdir_request_s {
+ struct orangefs_object_kref parent_refn;
+ struct ORANGEFS_sys_attr_s attributes;
+ char d_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_readdir_request_s {
+ struct orangefs_object_kref refn;
+ __u64 token;
+ __s32 max_dirent_count;
+ __s32 buf_index;
+};
+
+struct orangefs_readdirplus_request_s {
+ struct orangefs_object_kref refn;
+ __u64 token;
+ __s32 max_dirent_count;
+ __u32 mask;
+ __s32 buf_index;
+ __s32 __pad1;
+};
+
+struct orangefs_rename_request_s {
+ struct orangefs_object_kref old_parent_refn;
+ struct orangefs_object_kref new_parent_refn;
+ char d_old_name[ORANGEFS_NAME_MAX];
+ char d_new_name[ORANGEFS_NAME_MAX];
+};
+
+struct orangefs_statfs_request_s {
+ __s32 fs_id;
+ __s32 __pad1;
+};
+
+struct orangefs_truncate_request_s {
+ struct orangefs_object_kref refn;
+ __s64 size;
+};
+
+struct orangefs_mmap_ra_cache_flush_request_s {
+ struct orangefs_object_kref refn;
+};
+
+struct orangefs_fs_mount_request_s {
+ char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN]; /* copied from the mount devname by orangefs_mount() */
+};
+
+struct orangefs_fs_umount_request_s {
+ __s32 id;
+ __s32 fs_id;
+ char orangefs_config_server[ORANGEFS_MAX_SERVER_ADDR_LEN];
+};
+
+struct orangefs_getxattr_request_s {
+ struct orangefs_object_kref refn;
+ __s32 key_sz; /* orangefs_inode_getxattr() sets this to the key length plus one */
+ __s32 __pad1;
+ char key[ORANGEFS_MAX_XATTR_NAMELEN];
+};
+
+struct orangefs_setxattr_request_s {
+ struct orangefs_object_kref refn;
+ struct ORANGEFS_keyval_pair keyval;
+ __s32 flags;
+ __s32 __pad1;
+};
+
+struct orangefs_listxattr_request_s {
+ struct orangefs_object_kref refn;
+ __s32 requested_count;
+ __s32 __pad1;
+ __u64 token;
+};
+
+struct orangefs_removexattr_request_s {
+ struct orangefs_object_kref refn;
+ __s32 key_sz; /* orangefs_inode_removexattr() sets this to the key length plus one */
+ __s32 __pad1;
+ char key[ORANGEFS_MAX_XATTR_NAMELEN];
+};
+
+struct orangefs_op_cancel_s {
+ __u64 op_tag; /* tag of the operation being cancelled, see orangefs_cancel_op_in_progress() */
+};
+
+struct orangefs_fsync_request_s {
+ struct orangefs_object_kref refn;
+};
+
+enum orangefs_param_request_type {
+ ORANGEFS_PARAM_REQUEST_SET = 1,
+ ORANGEFS_PARAM_REQUEST_GET = 2
+};
+
+enum orangefs_param_request_op {
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_TIMEOUT_MSECS = 1,
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_HARD_LIMIT = 2,
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_SOFT_LIMIT = 3,
+ ORANGEFS_PARAM_REQUEST_OP_ACACHE_RECLAIM_PERCENTAGE = 4,
+ ORANGEFS_PARAM_REQUEST_OP_PERF_TIME_INTERVAL_SECS = 5,
+ ORANGEFS_PARAM_REQUEST_OP_PERF_HISTORY_SIZE = 6,
+ ORANGEFS_PARAM_REQUEST_OP_PERF_RESET = 7,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_TIMEOUT_MSECS = 8,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_HARD_LIMIT = 9,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_SOFT_LIMIT = 10,
+ ORANGEFS_PARAM_REQUEST_OP_NCACHE_RECLAIM_PERCENTAGE = 11,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_TIMEOUT_MSECS = 12,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_HARD_LIMIT = 13,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_SOFT_LIMIT = 14,
+ ORANGEFS_PARAM_REQUEST_OP_STATIC_ACACHE_RECLAIM_PERCENTAGE = 15,
+ ORANGEFS_PARAM_REQUEST_OP_CLIENT_DEBUG = 16,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_TIMEOUT_SECS = 17,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_HARD_LIMIT = 18,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_SOFT_LIMIT = 19,
+ ORANGEFS_PARAM_REQUEST_OP_CCACHE_RECLAIM_PERCENTAGE = 20,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_TIMEOUT_SECS = 21,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_HARD_LIMIT = 22,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
+ ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
+ ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
+};
+
+struct orangefs_param_request_s {
+ enum orangefs_param_request_type type;
+ enum orangefs_param_request_op op;
+ __s64 value;
+ char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
+};
+
+enum orangefs_perf_count_request_type {
+ ORANGEFS_PERF_COUNT_REQUEST_ACACHE = 1,
+ ORANGEFS_PERF_COUNT_REQUEST_NCACHE = 2,
+ ORANGEFS_PERF_COUNT_REQUEST_CAPCACHE = 3,
+};
+
+struct orangefs_perf_count_request_s {
+ enum orangefs_perf_count_request_type type;
+ __s32 __pad1;
+};
+
+struct orangefs_fs_key_request_s {
+ __s32 fsid;
+ __s32 __pad1;
+};
+
+struct orangefs_upcall_s {
+ __s32 type; /* an ORANGEFS_VFS_OP_* value; selects which "req" member is meaningful */
+ __u32 uid;
+ __u32 gid;
+ int pid; /* service_operation() stores current->pid here */
+ int tgid; /* service_operation() stores current->tgid here */
+ /*
+ * Trailers unused but must be retained for protocol compatibility.
+ * Do not drop or reorder these two members.
+ */
+ __s64 trailer_size;
+ char *trailer_buf;
+
+ union {
+ struct orangefs_io_request_s io;
+ struct orangefs_lookup_request_s lookup;
+ struct orangefs_create_request_s create;
+ struct orangefs_symlink_request_s sym;
+ struct orangefs_getattr_request_s getattr;
+ struct orangefs_setattr_request_s setattr;
+ struct orangefs_remove_request_s remove;
+ struct orangefs_mkdir_request_s mkdir;
+ struct orangefs_readdir_request_s readdir;
+ struct orangefs_readdirplus_request_s readdirplus;
+ struct orangefs_rename_request_s rename;
+ struct orangefs_statfs_request_s statfs;
+ struct orangefs_truncate_request_s truncate;
+ struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
+ struct orangefs_fs_mount_request_s fs_mount;
+ struct orangefs_fs_umount_request_s fs_umount;
+ struct orangefs_getxattr_request_s getxattr;
+ struct orangefs_setxattr_request_s setxattr;
+ struct orangefs_listxattr_request_s listxattr;
+ struct orangefs_removexattr_request_s removexattr;
+ struct orangefs_op_cancel_s cancel;
+ struct orangefs_fsync_request_s fsync;
+ struct orangefs_param_request_s param;
+ struct orangefs_perf_count_request_s perf_count;
+ struct orangefs_fs_key_request_s fs_key;
+ } req; /* per-operation request payload */
+};
+
+#endif /* __UPCALL_H */
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
new file mode 100644
index 000000000000..31635bc303fe
--- /dev/null
+++ b/fs/orangefs/waitqueue.c
@@ -0,0 +1,357 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ * (C) 2011 Omnibond Systems
+ *
+ * Changes by Acxiom Corporation to implement generic service_operation()
+ * function, Copyright Acxiom Corporation, 2005.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * In-kernel waitqueue operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+
+static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool);
+static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *);
+
+/*
+ * Walk the request queue and flag every operation found there as purged.
+ *
+ * NOTE: This is called from the device close after client-core has
+ * guaranteed that no new operations could appear on the list, since the
+ * client-core is going to exit anyway.
+ */
+void purge_waiting_ops(void)
+{
+	struct orangefs_kernel_op_s *queued;
+
+	spin_lock(&orangefs_request_list_lock);
+	list_for_each_entry(queued, &orangefs_request_list, list) {
+		gossip_debug(GOSSIP_WAIT_DEBUG,
+			     "pvfs2-client-core: purging op tag %llu %s\n",
+			     llu(queued->tag),
+			     get_opname_string(queued));
+		set_op_state_purged(queued);
+		gossip_debug(GOSSIP_DEV_DEBUG,
+			     "%s: op:%s: op_state:%d: process:%s:\n",
+			     __func__,
+			     get_opname_string(queued),
+			     queued->op_state,
+			     current->comm);
+	}
+	spin_unlock(&orangefs_request_list_lock);
+}
+
+/*
+ * submits a ORANGEFS operation and waits for it to complete
+ *
+ * op:      the operation to queue; its upcall pid/tgid are filled in here
+ * op_name: label used only in log messages
+ * flags:   ORANGEFS_OP_INTERRUPTIBLE makes the waits interruptible instead
+ *          of killable, ORANGEFS_OP_PRIORITY queues the op at the head of
+ *          the request list, ORANGEFS_OP_NO_MUTEX skips request_mutex
+ *
+ * Note op->downcall.status will contain the status of the operation (in
+ * errno format), whether provided by pvfs2-client or a result of failure to
+ * service the operation. If the caller wishes to distinguish, then
+ * op->op_state can be checked to see if it was serviced or not.
+ *
+ * When the wait ends with -EAGAIN the op is requeued from the top of this
+ * function, unless op->uses_shared_memory is set, in which case -EAGAIN is
+ * returned to the caller.
+ *
+ * Returns contents of op->downcall.status for convenience
+ */
+int service_operation(struct orangefs_kernel_op_s *op,
+ const char *op_name,
+ int flags)
+{
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ int ret = 0;
+
+ DEFINE_WAIT(wait_entry);
+
+ op->upcall.tgid = current->tgid;
+ op->upcall.pid = current->pid;
+
+retry_servicing:
+ op->downcall.status = 0;
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: %s op:%p: process:%s: pid:%d:\n",
+ __func__,
+ op_name,
+ op,
+ current->comm,
+ current->pid);
+
+ /*
+ * If ORANGEFS_OP_NO_MUTEX was set in flags, we need to avoid
+ * acquiring the request_mutex because we're servicing a
+ * high priority remount operation and the request_mutex is
+ * already taken.
+ */
+ if (!(flags & ORANGEFS_OP_NO_MUTEX)) {
+ if (flags & ORANGEFS_OP_INTERRUPTIBLE)
+ ret = mutex_lock_interruptible(&request_mutex);
+ else
+ ret = mutex_lock_killable(&request_mutex);
+ /*
+ * check to see if we were interrupted while waiting for
+ * mutex; if so the op was never queued and the error is
+ * returned as is
+ */
+ if (ret < 0) {
+ op->downcall.status = ret;
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: service_operation interrupted.\n",
+ __func__);
+ return ret;
+ }
+ }
+
+ /*
+ * queue up the operation; the request list lock is taken first and
+ * op->lock nests inside it
+ */
+ spin_lock(&orangefs_request_list_lock);
+ spin_lock(&op->lock);
+ set_op_state_waiting(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ /* add high priority remount op to the front of the line. */
+ if (flags & ORANGEFS_OP_PRIORITY)
+ list_add(&op->list, &orangefs_request_list);
+ else
+ list_add_tail(&op->list, &orangefs_request_list);
+ spin_unlock(&op->lock);
+ wake_up_interruptible(&orangefs_request_list_waitq);
+ if (!__is_daemon_in_service()) {
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s:client core is NOT in service.\n",
+ __func__);
+ timeout = op_timeout_secs * HZ;
+ }
+ spin_unlock(&orangefs_request_list_lock);
+
+ if (!(flags & ORANGEFS_OP_NO_MUTEX))
+ mutex_unlock(&request_mutex);
+
+ ret = wait_for_matching_downcall(op, timeout,
+ flags & ORANGEFS_OP_INTERRUPTIBLE);
+
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: wait_for_matching_downcall returned %d for %p\n",
+ __func__,
+ ret,
+ op);
+
+ /*
+ * got matching downcall; make sure status is in errno format.
+ * wait_for_matching_downcall() returned with op->lock held, so it
+ * is dropped here first.
+ */
+ if (!ret) {
+ spin_unlock(&op->lock);
+ op->downcall.status =
+ orangefs_normalize_to_errno(op->downcall.status);
+ ret = op->downcall.status;
+ goto out;
+ }
+
+ /* failed to get matching downcall */
+ if (ret == -ETIMEDOUT) {
+ gossip_err("%s: %s -- wait timed out; aborting attempt.\n",
+ __func__,
+ op_name);
+ }
+
+ /*
+ * remove a waiting op from the request list or
+ * remove an in-progress op from the in-progress list.
+ * The helper is entered with op->lock held and releases it.
+ */
+ orangefs_clean_up_interrupted_operation(op);
+
+ op->downcall.status = ret;
+ /* retry if operation has not been serviced and if requested */
+ if (ret == -EAGAIN) {
+ op->attempts++;
+ timeout = op_timeout_secs * HZ;
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "orangefs: tag %llu (%s)"
+ " -- operation to be retried (%d attempt)\n",
+ llu(op->tag),
+ op_name,
+ op->attempts);
+
+ /*
+ * io ops (ops that use the shared memory buffer) have
+ * to be returned to their caller for a retry. Other ops
+ * can just be recycled here.
+ */
+ if (!op->uses_shared_memory)
+ goto retry_servicing;
+ }
+
+out:
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: %s returning: %d for %p.\n",
+ __func__,
+ op_name,
+ ret,
+ op);
+ return ret;
+}
+
+/*
+ * This can get called on an I/O op if it had a bad service_operation.
+ *
+ * Reuses @op as an ORANGEFS_VFS_OP_CANCEL request for its own old tag: the
+ * I/O buffer index is saved in op->slot_to_free, the upcall and downcall
+ * are wiped, the op gets a fresh tag and is put at the head of the request
+ * list.  This function does not wait for the cancel to be serviced.
+ *
+ * Returns true if the cancel was queued, false if @op was not in progress
+ * or the daemon is not in service.  In the second case the upcall and
+ * downcall have already been overwritten.
+ */
+bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op)
+{
+ u64 tag = op->tag;
+ if (!op_state_in_progress(op))
+ return false;
+
+ op->slot_to_free = op->upcall.req.io.buf_index;
+ memset(&op->upcall, 0, sizeof(op->upcall));
+ memset(&op->downcall, 0, sizeof(op->downcall));
+ op->upcall.type = ORANGEFS_VFS_OP_CANCEL;
+ op->upcall.req.cancel.op_tag = tag;
+ op->downcall.type = ORANGEFS_VFS_OP_INVALID;
+ op->downcall.status = -1;
+ orangefs_new_tag(op);
+
+ spin_lock(&orangefs_request_list_lock);
+ /* orangefs_request_list_lock is enough of a barrier here */
+ if (!__is_daemon_in_service()) {
+ spin_unlock(&orangefs_request_list_lock);
+ return false;
+ }
+ spin_lock(&op->lock);
+ set_op_state_waiting(op);
+ gossip_debug(GOSSIP_DEV_DEBUG,
+ "%s: op:%s: op_state:%d: process:%s:\n",
+ __func__,
+ get_opname_string(op),
+ op->op_state,
+ current->comm);
+ list_add(&op->list, &orangefs_request_list);
+ spin_unlock(&op->lock);
+ spin_unlock(&orangefs_request_list_lock);
+
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "Attempting ORANGEFS operation cancellation of tag %llu\n",
+ llu(tag));
+ return true;
+}
+
+/*
+ * Change an op to the "given up" state and remove it from its list.
+ *
+ * Entered with op->lock held; every branch below releases it.  On return
+ * the op's completion has been reinitialised.
+ */
+static void
+ orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
+{
+ /*
+ * handle interrupted cases depending on what state we were in when
+ * the interruption is detected.
+ *
+ * Called with op->lock held.
+ */
+
+ /*
+ * List manipulation code elsewhere will ignore ops that
+ * have been given up upon.
+ */
+ op->op_state |= OP_VFS_STATE_GIVEN_UP;
+
+ if (list_empty(&op->list)) {
+ /*
+ * caught copying to/from daemon: the op is on neither list,
+ * so wait for its completion to be signalled
+ */
+ BUG_ON(op_state_serviced(op));
+ spin_unlock(&op->lock);
+ wait_for_completion(&op->waitq);
+ } else if (op_state_waiting(op)) {
+ /*
+ * upcall hasn't been read; remove op from upcall request
+ * list. op->lock is dropped before the list lock is taken.
+ */
+ spin_unlock(&op->lock);
+ spin_lock(&orangefs_request_list_lock);
+ list_del_init(&op->list);
+ spin_unlock(&orangefs_request_list_lock);
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "Interrupted: Removed op %p from request_list\n",
+ op);
+ } else if (op_state_in_progress(op)) {
+ /* op must be removed from the in progress htable */
+ spin_unlock(&op->lock);
+ spin_lock(&htable_ops_in_progress_lock);
+ list_del_init(&op->list);
+ spin_unlock(&htable_ops_in_progress_lock);
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "Interrupted: Removed op %p"
+ " from htable_ops_in_progress\n",
+ op);
+ } else {
+ spin_unlock(&op->lock);
+ gossip_err("interrupted operation is in a weird state 0x%x\n",
+ op->op_state);
+ }
+ reinit_completion(&op->waitq);
+}
+
+/*
+ * Sleeps on waitqueue waiting for matching downcall.
+ * If client-core finishes servicing, then we are good to go.
+ * else if client-core exits, we get woken up here, and retry with a timeout
+ *
+ * op:            the operation whose completion (op->waitq) is waited on
+ * timeout:       maximum time to wait, in jiffies
+ * interruptible: true for an interruptible wait, false for a killable one
+ *
+ * NOTE(review): the sentence below says the op is off both lists when this
+ * returns, but nothing in this function removes it; on the error paths the
+ * caller does that via orangefs_clean_up_interrupted_operation(). Confirm.
+ *
+ * When this call returns to the caller, the specified op will no
+ * longer be in either the in_progress hash table or on the request list.
+ *
+ * Returns 0 on success and -errno on failure
+ * Errors are:
+ * EAGAIN in case we want the caller to requeue and try again..
+ * EINTR/EIO/ETIMEDOUT indicating we are done trying to service this
+ * operation since client-core seems to be exiting too often
+ * or if we were interrupted.
+ *
+ * The serviced state is checked first, so an op that was serviced is
+ * reported as success even if the wait itself was interrupted.
+ *
+ * Returns with op->lock taken.
+ */
+static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
+ long timeout,
+ bool interruptible)
+{
+ long n;
+
+ /*
+ * There's a "schedule_timeout" inside of these wait
+ * primitives, during which the op is out of the hands of the
+ * user process that needs something done and is being
+ * manipulated by the client-core process.
+ */
+ if (interruptible)
+ n = wait_for_completion_interruptible_timeout(&op->waitq,
+ timeout);
+ else
+ n = wait_for_completion_killable_timeout(&op->waitq, timeout);
+
+ spin_lock(&op->lock);
+
+ if (op_state_serviced(op))
+ return 0;
+
+ if (unlikely(n < 0)) {
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: operation interrupted, tag %llu, %p\n",
+ __func__,
+ llu(op->tag),
+ op);
+ return -EINTR;
+ }
+ if (op_state_purged(op)) {
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: operation purged, tag %llu, %p, %d\n",
+ __func__,
+ llu(op->tag),
+ op,
+ op->attempts);
+ return (op->attempts < ORANGEFS_PURGE_RETRY_COUNT) ?
+ -EAGAIN :
+ -EIO;
+ }
+ /* must have timed out, then... (not serviced, not interrupted, not purged) */
+ gossip_debug(GOSSIP_WAIT_DEBUG,
+ "%s: operation timed out, tag %llu, %p, %d)\n",
+ __func__,
+ llu(op->tag),
+ op,
+ op->attempts);
+ return -ETIMEDOUT;
+}
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
new file mode 100644
index 000000000000..63a6280d8c3a
--- /dev/null
+++ b/fs/orangefs/xattr.c
@@ -0,0 +1,530 @@
+/*
+ * (C) 2001 Clemson University and The University of Chicago
+ *
+ * See COPYING in top-level directory.
+ */
+
+/*
+ * Linux VFS extended attribute operations.
+ */
+
+#include "protocol.h"
+#include "orangefs-kernel.h"
+#include "orangefs-bufmap.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+#define SYSTEM_ORANGEFS_KEY "system.pvfs2."
+#define SYSTEM_ORANGEFS_KEY_LEN 13
+
+/*
+ * Decide whether a key may be reported by listxattr.
+ *
+ * Despite the name, the return value is
+ * 0 if the key starts with SYSTEM_ORANGEFS_KEY, i.e. it is reserved and
+ * must not be listed;
+ * 1 for every other key, including keys shorter than the reserved
+ * prefix, which are meant to be listed.
+ */
+static int is_reserved_key(const char *key, size_t size)
+{
+	if (size >= SYSTEM_ORANGEFS_KEY_LEN &&
+	    strncmp(key, SYSTEM_ORANGEFS_KEY, SYSTEM_ORANGEFS_KEY_LEN) == 0)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Map the VFS setxattr flags onto the orangefs ones.  XATTR_REPLACE takes
+ * precedence over XATTR_CREATE; 0 is returned when neither is set.
+ */
+static inline int convert_to_internal_xattr_flags(int setxattr_flags)
+{
+	/* Attribute must exist! */
+	if (setxattr_flags & XATTR_REPLACE)
+		return ORANGEFS_XATTR_REPLACE;
+
+	/* Attribute must not exist */
+	if (setxattr_flags & XATTR_CREATE)
+		return ORANGEFS_XATTR_CREATE;
+
+	return 0;
+}
+
+
+/*
+ * Tries to get a specified key's attributes of a given
+ * file into a user-specified buffer. Note that the getxattr
+ * interface allows for the users to probe the size of an
+ * extended attribute by passing in a value of 0 to size.
+ * Thus our return value is always the size of the attribute
+ * unless the key does not exist for the file and/or if
+ * there were errors in fetching the attribute value.
+ *
+ * @inode:  inode whose attribute is read
+ * @prefix: namespace prefix prepended to @name; NULL is treated as ""
+ * @name:   attribute name, must not be NULL
+ * @buffer: destination for the value; not touched when @size is 0
+ * @size:   capacity of @buffer, or 0 to query the value length only
+ *
+ * Returns the value length on success, -EINVAL if prefix+name is too long,
+ * -ENODATA if the attribute does not exist, -ERANGE if @buffer is too
+ * small, -ENOMEM if no op could be allocated, or another negative errno
+ * from service_operation().
+ */
+ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix,
+		const char *name, void *buffer, size_t size)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *new_op = NULL;
+	/* the remove/set paths accept a NULL prefix; do the same here */
+	const char *pfx = prefix ? prefix : "";
+	ssize_t ret = -ENOMEM;
+	ssize_t length = 0;
+	int fsuid;
+	int fsgid;
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "%s: prefix %s name %s, buffer_size %zu\n",
+		     __func__, pfx, name, size);
+
+	if ((strlen(name) + strlen(pfx)) >= ORANGEFS_MAX_XATTR_NAMELEN) {
+		gossip_err("Invalid key length (%d)\n",
+			   (int)(strlen(name) + strlen(pfx)));
+		return -EINVAL;
+	}
+
+	fsuid = from_kuid(current_user_ns(), current_fsuid());
+	fsgid = from_kgid(current_user_ns(), current_fsgid());
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "getxattr on inode %pU, name %s "
+		     "(uid %o, gid %o)\n",
+		     get_khandle_from_ino(inode),
+		     name,
+		     fsuid,
+		     fsgid);
+
+	down_read(&orangefs_inode->xattr_sem);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR);
+	if (!new_op)
+		goto out_unlock;
+
+	new_op->upcall.req.getxattr.refn = orangefs_inode->refn;
+	ret = snprintf((char *)new_op->upcall.req.getxattr.key,
+		       ORANGEFS_MAX_XATTR_NAMELEN, "%s%s", pfx, name);
+
+	/*
+	 * NOTE: Although keys are meant to be NULL terminated textual
+	 * strings, I am going to explicitly pass the length just in case
+	 * we change this later on...
+	 */
+	new_op->upcall.req.getxattr.key_sz = ret + 1;
+
+	ret = service_operation(new_op, "orangefs_inode_getxattr",
+				get_interruptible_flag(inode));
+	if (ret != 0) {
+		if (ret == -ENOENT) {
+			ret = -ENODATA;
+			gossip_debug(GOSSIP_XATTR_DEBUG,
+				     "orangefs_inode_getxattr: inode %pU key %s"
+				     " does not exist!\n",
+				     get_khandle_from_ino(inode),
+				     (char *)new_op->upcall.req.getxattr.key);
+		}
+		goto out_release_op;
+	}
+
+	/*
+	 * Length returned includes null terminator.
+	 */
+	length = new_op->downcall.resp.getxattr.val_sz;
+
+	/*
+	 * Just return the length of the queried attribute.
+	 */
+	if (size == 0) {
+		ret = length;
+		goto out_release_op;
+	}
+
+	/*
+	 * Check to see if key length is > provided buffer size.
+	 */
+	if (length > size) {
+		ret = -ERANGE;
+		goto out_release_op;
+	}
+
+	/* copy the value and zero whatever is left of the caller's buffer */
+	memcpy(buffer, new_op->downcall.resp.getxattr.val, length);
+	memset(buffer + length, 0, size - length);
+	/* log the value length; ret is still 0 here */
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "orangefs_inode_getxattr: inode %pU "
+		     "key %s key_sz %d, val_len %d\n",
+		     get_khandle_from_ino(inode),
+		     (char *)new_op->upcall.req.getxattr.key,
+		     (int)new_op->upcall.req.getxattr.key_sz,
+		     (int)length);
+
+	ret = length;
+
+out_release_op:
+	op_release(new_op);
+out_unlock:
+	up_read(&orangefs_inode->xattr_sem);
+	return ret;
+}
+
+/*
+ * Ask the server to remove the attribute named by @prefix + @name (a NULL
+ * @prefix counts as "").
+ *
+ * A missing attribute (-ENOENT from the server) is reported as -ENODATA
+ * when @flags contains XATTR_REPLACE and as success otherwise.  Returns 0
+ * or a negative errno.
+ */
+static int orangefs_inode_removexattr(struct inode *inode,
+		const char *prefix,
+		const char *name,
+		int flags)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	const char *key_prefix = prefix ? prefix : "";
+	struct orangefs_kernel_op_s *new_op;
+	int ret;
+
+	down_write(&orangefs_inode->xattr_sem);
+
+	new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
+	if (!new_op) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	new_op->upcall.req.removexattr.refn = orangefs_inode->refn;
+
+	/*
+	 * Keys are NUL-terminated text, but the length (terminator
+	 * included) is passed explicitly in case that ever changes.
+	 */
+	ret = snprintf((char *)new_op->upcall.req.removexattr.key,
+		       ORANGEFS_MAX_XATTR_NAMELEN,
+		       "%s%s",
+		       key_prefix,
+		       name);
+	new_op->upcall.req.removexattr.key_sz = ret + 1;
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "orangefs_inode_removexattr: key %s, key_sz %d\n",
+		     (char *)new_op->upcall.req.removexattr.key,
+		     (int)new_op->upcall.req.removexattr.key_sz);
+
+	ret = service_operation(new_op,
+				"orangefs_inode_removexattr",
+				get_interruptible_flag(inode));
+
+	/* Request to replace a non-existent attribute is an error. */
+	if (ret == -ENOENT)
+		ret = (flags & XATTR_REPLACE) ? -ENODATA : 0;
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "orangefs_inode_removexattr: returning %d\n", ret);
+
+	op_release(new_op);
+out_unlock:
+	up_write(&orangefs_inode->xattr_sem);
+	return ret;
+}
+
+/*
+ * Tries to set an attribute for a given key on a file.
+ *
+ * @inode:  inode whose attribute is written
+ * @prefix: namespace prefix prepended to @name; NULL is treated as ""
+ * @name:   attribute name, must not be NULL
+ * @value:  attribute value; may be binary
+ * @size:   length of @value; must be below ORANGEFS_MAX_XATTR_VALUELEN
+ * @flags:  XATTR_CREATE / XATTR_REPLACE; must not be negative
+ *
+ * A call with @size == 0 and @value == NULL removes the attribute instead.
+ *
+ * Returns a -ve number on error and 0 on success. Key is text, but value
+ * can be binary!
+ */
+int orangefs_inode_setxattr(struct inode *inode, const char *prefix,
+		const char *name, const void *value, size_t size, int flags)
+{
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *new_op;
+	/*
+	 * Normalise once: the kernel's "%s" renders NULL as "(null)", which
+	 * would otherwise end up in the key sent to the server.
+	 */
+	const char *pfx = prefix ? prefix : "";
+	int internal_flag = 0;
+	int ret = -ENOMEM;
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "%s: prefix %s, name %s, buffer_size %zu\n",
+		     __func__, pfx, name, size);
+
+	if (size >= ORANGEFS_MAX_XATTR_VALUELEN ||
+	    flags < 0) {
+		gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
+			   (int)size,
+			   flags);
+		return -EINVAL;
+	}
+
+	internal_flag = convert_to_internal_xattr_flags(flags);
+
+	/* the full key, terminator included, must fit the upcall buffer */
+	if (strlen(name) + strlen(pfx) >= ORANGEFS_MAX_XATTR_NAMELEN) {
+		gossip_err
+		    ("orangefs_inode_setxattr: bogus key size (%d)\n",
+		     (int)(strlen(name) + strlen(pfx)));
+		return -EINVAL;
+	}
+
+	/* This is equivalent to a removexattr */
+	if (size == 0 && value == NULL) {
+		gossip_debug(GOSSIP_XATTR_DEBUG,
+			     "removing xattr (%s%s)\n",
+			     pfx,
+			     name);
+		return orangefs_inode_removexattr(inode, prefix, name, flags);
+	}
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "setxattr on inode %pU, name %s\n",
+		     get_khandle_from_ino(inode),
+		     name);
+
+	down_write(&orangefs_inode->xattr_sem);
+	new_op = op_alloc(ORANGEFS_VFS_OP_SETXATTR);
+	if (!new_op)
+		goto out_unlock;
+
+
+	new_op->upcall.req.setxattr.refn = orangefs_inode->refn;
+	new_op->upcall.req.setxattr.flags = internal_flag;
+	/*
+	 * NOTE: Although keys are meant to be NULL terminated textual
+	 * strings, I am going to explicitly pass the length just in
+	 * case we change this later on...
+	 */
+	ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key,
+		       ORANGEFS_MAX_XATTR_NAMELEN,
+		       "%s%s",
+		       pfx, name);
+	new_op->upcall.req.setxattr.keyval.key_sz = ret + 1;
+	memcpy(new_op->upcall.req.setxattr.keyval.val, value, size);
+	new_op->upcall.req.setxattr.keyval.val_sz = size;
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "orangefs_inode_setxattr: key %s, key_sz %d "
+		     " value size %zu\n",
+		     (char *)new_op->upcall.req.setxattr.keyval.key,
+		     (int)new_op->upcall.req.setxattr.keyval.key_sz,
+		     size);
+
+	ret = service_operation(new_op,
+				"orangefs_inode_setxattr",
+				get_interruptible_flag(inode));
+
+	gossip_debug(GOSSIP_XATTR_DEBUG,
+		     "orangefs_inode_setxattr: returning %d\n",
+		     ret);
+
+	/* when request is serviced properly, free req op struct */
+	op_release(new_op);
+out_unlock:
+	up_write(&orangefs_inode->xattr_sem);
+	return ret;
+}
+
+/*
+ * Copy the names of the extended attributes of @dentry's inode into @buffer.
+ *
+ * @dentry: object whose xattr keys are listed
+ * @buffer: destination for the keys; may be NULL only when @size is 0
+ * @size:   capacity of @buffer in bytes; 0 asks for a size estimate only
+ *
+ * With @size == 0 nothing is copied and the return value is an upper bound
+ * (returned_count * ORANGEFS_MAX_XATTR_NAMELEN) the caller can use to size a
+ * buffer. Otherwise the keys are fetched in batches, following the iteration
+ * token until ORANGEFS_ITERATE_END, and only whole keys that still fit in
+ * @buffer are copied; the number of bytes copied is returned.
+ *
+ * Returns -EINVAL for a NULL @buffer with non-zero @size, -ENOMEM if no op
+ * could be allocated, -EIO if the response carries an out-of-range key count
+ * or key length, or the non-zero result of service_operation().
+ */
+ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	struct orangefs_kernel_op_s *new_op;
+	__u64 token = ORANGEFS_ITERATE_START;
+	ssize_t ret = -ENOMEM;
+	ssize_t total = 0;
+	int count_keys = 0;
+	int key_size;
+	int i = 0;
+	int returned_count = 0;
+
+	if (size > 0 && buffer == NULL) {
+		gossip_err("%s: bogus NULL pointers\n", __func__);
+		return -EINVAL;
+	}
+
+	down_read(&orangefs_inode->xattr_sem);
+	new_op = op_alloc(ORANGEFS_VFS_OP_LISTXATTR);
+	if (!new_op)
+		goto out_unlock;
+
+	if (buffer && size > 0)
+		memset(buffer, 0, size);
+
+try_again:
+	/* key_size is the read offset into this batch's key data. */
+	key_size = 0;
+	new_op->upcall.req.listxattr.refn = orangefs_inode->refn;
+	new_op->upcall.req.listxattr.token = token;
+	new_op->upcall.req.listxattr.requested_count =
+	    (size == 0) ? 0 : ORANGEFS_MAX_XATTR_LISTLEN;
+	ret = service_operation(new_op, __func__,
+				get_interruptible_flag(inode));
+	if (ret != 0)
+		goto done;
+
+	if (size == 0) {
+		/*
+		 * This is a bit of a big upper limit, but I did not want to
+		 * spend too much time getting this correct, since users end
+		 * up allocating memory rather than us...
+		 */
+		total = new_op->downcall.resp.listxattr.returned_count *
+			ORANGEFS_MAX_XATTR_NAMELEN;
+		goto done;
+	}
+
+	returned_count = new_op->downcall.resp.listxattr.returned_count;
+	/*
+	 * A full batch of ORANGEFS_MAX_XATTR_LISTLEN keys is exactly what was
+	 * requested above, so only counts beyond that are impossible.
+	 */
+	if (returned_count < 0 ||
+	    returned_count > ORANGEFS_MAX_XATTR_LISTLEN) {
+		gossip_err("%s: impossible value for returned_count:%d:\n",
+			   __func__,
+			   returned_count);
+		ret = -EIO;
+		goto done;
+	}
+
+	/*
+	 * Check to see how much can be fit in the buffer. Fit only whole keys.
+	 */
+	for (i = 0; i < returned_count; i++) {
+		if (new_op->downcall.resp.listxattr.lengths[i] < 0 ||
+		    new_op->downcall.resp.listxattr.lengths[i] >
+		    ORANGEFS_MAX_XATTR_NAMELEN) {
+			/* Report both the slot and the offending length. */
+			gossip_err("%s: impossible value for lengths[%d]:%d:\n",
+				   __func__,
+				   i,
+				   new_op->downcall.resp.listxattr.lengths[i]);
+			ret = -EIO;
+			goto done;
+		}
+		/* Next key does not fit: stop and report what was copied. */
+		if (total + new_op->downcall.resp.listxattr.lengths[i] > size)
+			goto done;
+
+		/*
+		 * Since many dumb programs try to setxattr() on our reserved
+		 * xattrs this is a feeble attempt at defeating those by not
+		 * listing them in the output of listxattr.. sigh
+		 *
+		 * NOTE(review): keys are copied when is_reserved_key()
+		 * returns non-zero and skipped otherwise, so the helper
+		 * presumably returns non-zero for keys that are NOT
+		 * reserved - confirm against its definition.
+		 */
+		if (is_reserved_key(new_op->downcall.resp.listxattr.key +
+				    key_size,
+				    new_op->downcall.resp.
+				    listxattr.lengths[i])) {
+			gossip_debug(GOSSIP_XATTR_DEBUG, "Copying key %d -> %s\n",
+				     i, new_op->downcall.resp.listxattr.key +
+				     key_size);
+			memcpy(buffer + total,
+			       new_op->downcall.resp.listxattr.key + key_size,
+			       new_op->downcall.resp.listxattr.lengths[i]);
+			total += new_op->downcall.resp.listxattr.lengths[i];
+			count_keys++;
+		} else {
+			gossip_debug(GOSSIP_XATTR_DEBUG, "[RESERVED] key %d -> %s\n",
+				     i, new_op->downcall.resp.listxattr.key +
+				     key_size);
+		}
+		key_size += new_op->downcall.resp.listxattr.lengths[i];
+	}
+
+	/*
+	 * Since the buffer was large enough, we might have to continue
+	 * fetching more keys!
+	 */
+	token = new_op->downcall.resp.listxattr.token;
+	if (token != ORANGEFS_ITERATE_END)
+		goto try_again;
+
+done:
+	gossip_debug(GOSSIP_XATTR_DEBUG, "%s: returning %d"
+		     " [size of buffer %ld] (filled in %d keys)\n",
+		     __func__,
+		     ret ? (int)ret : (int)total,
+		     (long)size,
+		     count_keys);
+	op_release(new_op);
+	/* On success the result is the byte count (or the size estimate). */
+	if (ret == 0)
+		ret = total;
+out_unlock:
+	up_read(&orangefs_inode->xattr_sem);
+	return ret;
+}
+
+/*
+ * ->set hook of the default handler: forward the request for @name on
+ * @dentry's inode to orangefs_inode_setxattr() using
+ * ORANGEFS_XATTR_NAME_DEFAULT_PREFIX and return its result.
+ * @handler is not used.
+ */
+static int orangefs_xattr_set_default(const struct xattr_handler *handler,
+				      struct dentry *dentry,
+				      const char *name,
+				      const void *buffer,
+				      size_t size,
+				      int flags)
+{
+	struct inode *inode = dentry->d_inode;
+
+	return orangefs_inode_setxattr(inode,
+				       ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+				       name, buffer, size, flags);
+}
+
+/*
+ * ->get hook of the default handler: forward the lookup of @name on
+ * @dentry's inode to orangefs_inode_getxattr() using
+ * ORANGEFS_XATTR_NAME_DEFAULT_PREFIX and return its result.
+ * @handler is not used.
+ */
+static int orangefs_xattr_get_default(const struct xattr_handler *handler,
+				      struct dentry *dentry,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+
+	return orangefs_inode_getxattr(inode,
+				       ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+				       name, buffer, size);
+}
+
+/*
+ * ->set hook of the trusted handler: forward the request for @name on
+ * @dentry's inode to orangefs_inode_setxattr() using
+ * ORANGEFS_XATTR_NAME_TRUSTED_PREFIX and return its result.
+ * @handler is not used.
+ */
+static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
+				      struct dentry *dentry,
+				      const char *name,
+				      const void *buffer,
+				      size_t size,
+				      int flags)
+{
+	struct inode *inode = dentry->d_inode;
+
+	return orangefs_inode_setxattr(inode,
+				       ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
+				       name, buffer, size, flags);
+}
+
+/*
+ * ->get hook of the trusted handler: forward the lookup of @name on
+ * @dentry's inode to orangefs_inode_getxattr() using
+ * ORANGEFS_XATTR_NAME_TRUSTED_PREFIX and return its result.
+ * @handler is not used.
+ */
+static int orangefs_xattr_get_trusted(const struct xattr_handler *handler,
+				      struct dentry *dentry,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+
+	return orangefs_inode_getxattr(inode,
+				       ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
+				       name, buffer, size);
+}
+
+/*
+ * Handler for xattr names starting with ORANGEFS_XATTR_NAME_TRUSTED_PREFIX.
+ * Never modified after initialisation, hence const.
+ */
+static const struct xattr_handler orangefs_xattr_trusted_handler = {
+	.prefix = ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
+	.get = orangefs_xattr_get_trusted,
+	.set = orangefs_xattr_set_trusted,
+};
+
+/*
+ * Catch-all handler for xattr names without a dedicated prefix.
+ * Never modified after initialisation, hence const.
+ */
+static const struct xattr_handler orangefs_xattr_default_handler = {
+	/*
+	 * NOTE: this is set to be the empty string.
+	 * so that all un-prefixed xattrs keys get caught
+	 * here!
+	 */
+	.prefix = ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
+	.get = orangefs_xattr_get_default,
+	.set = orangefs_xattr_set_default,
+};
+
+/*
+ * NULL-terminated table of the xattr handlers used by orangefs: the two
+ * POSIX ACL handlers, the trusted handler and the default handler.
+ *
+ * NOTE(review): the default handler's prefix is documented as the empty
+ * string, so it presumably has to stay last to avoid shadowing the more
+ * specific prefixes - confirm against the VFS handler lookup.
+ */
+const struct xattr_handler *orangefs_xattr_handlers[] = {
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ &orangefs_xattr_trusted_handler,
+ &orangefs_xattr_default_handler,
+ NULL
+};
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 871fcb67be97..cc514da6f3e7 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -7,6 +7,7 @@
* the Free Software Foundation.
*/
+#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
@@ -16,15 +17,46 @@
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
+#include <linux/fdtable.h>
+#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
+/*
+ * Debug knob, exposed as module parameter "check_copy_up" (mode
+ * S_IWUSR | S_IRUGO): when set, ovl_do_check_copy_up() scans the fds of
+ * the process causing a copy-up and warns about matching open files.
+ */
+static bool __read_mostly ovl_check_copy_up;
+module_param_named(check_copy_up, ovl_check_copy_up, bool,
+		   S_IWUSR | S_IRUGO);
+/* The description is keyed by the parameter's external name. */
+MODULE_PARM_DESC(check_copy_up,
+		 "Warn on copy-up when causing process also has a R/O fd open");
+
+/*
+ * iterate_fd() callback: @data is the dentry about to be copied up. Emit a
+ * rate-limited warning when open file @f (descriptor @fd) refers to that
+ * dentry's inode. Always returns 0.
+ */
+static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
+{
+	const struct dentry *copied = data;
+
+	if (f->f_inode != d_inode(copied))
+		return 0;
+
+	pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
+			    f, fd, current->pid, current->comm);
+	return 0;
+}
+
+/*
+ * When the ovl_check_copy_up knob is set, walk the fds open by the current
+ * process and let ovl_check_fd() warn if something like the following
+ * scenario is about to occur:
+ *
+ *	fd1 = open("foo", O_RDONLY);
+ *	fd2 = open("foo", O_RDWR);
+ */
+static void ovl_do_check_copy_up(struct dentry *dentry)
+{
+	if (!ovl_check_copy_up)
+		return;
+
+	iterate_fd(current->files, 0, ovl_check_fd, dentry);
+}
+
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
- ssize_t list_size, size;
- char *buf, *name, *value;
- int error;
+ ssize_t list_size, size, value_size = 0;
+ char *buf, *name, *value = NULL;
+ int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
@@ -41,29 +73,40 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
if (!buf)
return -ENOMEM;
- error = -ENOMEM;
- value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
- if (!value)
- goto out;
-
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
- goto out_free_value;
+ goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
- size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
- if (size <= 0) {
+retry:
+ size = vfs_getxattr(old, name, value, value_size);
+ if (size == -ERANGE)
+ size = vfs_getxattr(old, name, NULL, 0);
+
+ if (size < 0) {
error = size;
- goto out_free_value;
+ break;
}
+
+ if (size > value_size) {
+ void *new;
+
+ new = krealloc(value, size, GFP_KERNEL);
+ if (!new) {
+ error = -ENOMEM;
+ break;
+ }
+ value = new;
+ value_size = size;
+ goto retry;
+ }
+
error = vfs_setxattr(new, name, value, size, 0);
if (error)
- goto out_free_value;
+ break;
}
-
-out_free_value:
kfree(value);
out:
kfree(buf);
@@ -195,8 +238,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
- struct kstat *stat, struct iattr *attr,
- const char *link)
+ struct kstat *stat, const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
@@ -225,6 +267,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (S_ISREG(stat->mode)) {
struct path upperpath;
+
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
@@ -238,11 +281,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
if (err)
goto out_cleanup;
- mutex_lock(&newdentry->d_inode->i_mutex);
+ inode_lock(newdentry->d_inode);
err = ovl_set_attr(newdentry, stat);
- if (!err && attr)
- err = notify_change(newdentry, attr, NULL);
- mutex_unlock(&newdentry->d_inode->i_mutex);
+ inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
@@ -286,8 +327,7 @@ out_cleanup:
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat,
- struct iattr *attr)
+ struct path *lowerpath, struct kstat *stat)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
@@ -302,6 +342,8 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
if (WARN_ON(!workdir))
return -EROFS;
+ ovl_do_check_copy_up(lowerpath->dentry);
+
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
@@ -345,26 +387,19 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
- unlock_rename(workdir, upperdir);
+ /* Raced with another copy-up? Nothing to do, then... */
err = 0;
- /* Raced with another copy-up? Do the setattr here */
- if (attr) {
- mutex_lock(&upperdentry->d_inode->i_mutex);
- err = notify_change(upperdentry, attr, NULL);
- mutex_unlock(&upperdentry->d_inode->i_mutex);
- }
- goto out_put_cred;
+ goto out_unlock;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
- stat, attr, link);
+ stat, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
-out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
@@ -406,7 +441,7 @@ int ovl_copy_up(struct dentry *dentry)
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
- err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
dput(parent);
dput(next);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 692ceda3bc21..b3fc0a35bf62 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry;
int err;
- mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
@@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
out_dput:
dput(newdentry);
out_unlock:
- mutex_unlock(&udir->i_mutex);
+ inode_unlock(udir);
return err;
}
@@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (err)
goto out_cleanup;
- mutex_lock(&opaquedir->d_inode->i_mutex);
+ inode_lock(opaquedir->d_inode);
err = ovl_set_attr(opaquedir, &stat);
- mutex_unlock(&opaquedir->d_inode->i_mutex);
+ inode_unlock(opaquedir->d_inode);
if (err)
goto out_cleanup;
@@ -596,21 +596,25 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
- struct dentry *upper = ovl_dentry_upper(dentry);
+ struct dentry *upper;
int err;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ upper = lookup_one_len(dentry->d_name.name, upperdir,
+ dentry->d_name.len);
+ err = PTR_ERR(upper);
+ if (IS_ERR(upper))
+ goto out_unlock;
+
err = -ESTALE;
- if (upper->d_parent == upperdir) {
- /* Don't let d_delete() think it can reset d_inode */
- dget(upper);
+ if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
- dput(upper);
ovl_dentry_version_inc(dentry->d_parent);
}
+ dput(upper);
/*
* Keeping this dentry hashed would mean having to release
@@ -618,8 +622,10 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
- d_drop(dentry);
- mutex_unlock(&dir->i_mutex);
+ if (!err)
+ d_drop(dentry);
+out_unlock:
+ inode_unlock(dir);
return err;
}
@@ -713,7 +719,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct dentry *trap;
bool old_opaque;
bool new_opaque;
- bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
@@ -839,29 +844,38 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
trap = lock_rename(new_upperdir, old_upperdir);
- olddentry = ovl_dentry_upper(old);
- newdentry = ovl_dentry_upper(new);
- if (newdentry) {
+
+ olddentry = lookup_one_len(old->d_name.name, old_upperdir,
+ old->d_name.len);
+ err = PTR_ERR(olddentry);
+ if (IS_ERR(olddentry))
+ goto out_unlock;
+
+ err = -ESTALE;
+ if (olddentry != ovl_dentry_upper(old))
+ goto out_dput_old;
+
+ newdentry = lookup_one_len(new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_dput_old;
+
+ err = -ESTALE;
+ if (ovl_dentry_upper(new)) {
if (opaquedir) {
- newdentry = opaquedir;
- opaquedir = NULL;
+ if (newdentry != opaquedir)
+ goto out_dput;
} else {
- dget(newdentry);
+ if (newdentry != ovl_dentry_upper(new))
+ goto out_dput;
}
} else {
- new_create = true;
- newdentry = lookup_one_len(new->d_name.name, new_upperdir,
- new->d_name.len);
- err = PTR_ERR(newdentry);
- if (IS_ERR(newdentry))
- goto out_unlock;
+ if (!d_is_negative(newdentry) &&
+ (!new_opaque || !ovl_is_whiteout(newdentry)))
+ goto out_dput;
}
- err = -ESTALE;
- if (olddentry->d_parent != old_upperdir)
- goto out_dput;
- if (newdentry->d_parent != new_upperdir)
- goto out_dput;
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
@@ -903,6 +917,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
+ /*
+ * Old dentry now lives in different location. Dentries in
+ * lowerstack are stale. We cannot drop them here because
+ * access to them is lockless. This could be only pure upper
+ * or opaque directory - numlower is zero. Or upper non-dir
+ * entry - its pureness is tracked by flag opaque.
+ */
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
@@ -917,6 +938,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
out_dput:
dput(newdentry);
+out_dput_old:
+ dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index ec0c2a050043..a4ff5d0d7db9 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -12,8 +12,7 @@
#include <linux/xattr.h>
#include "overlayfs.h"
-static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
- bool no_data)
+static int ovl_copy_up_truncate(struct dentry *dentry)
{
int err;
struct dentry *parent;
@@ -30,10 +29,8 @@ static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
if (err)
goto out_dput_parent;
- if (no_data)
- stat.size = 0;
-
- err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
+ stat.size = 0;
+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
out_dput_parent:
dput(parent);
@@ -45,17 +42,32 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
int err;
struct dentry *upperdentry;
+ /*
+ * Check for permissions before trying to copy-up. This is redundant
+ * since it will be rechecked later by ->setattr() on upper dentry. But
+ * without this, copy-up can be triggered by just about anybody.
+ *
+ * We don't initialize inode->size, which just means that
+ * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
+ * check for a swapfile (which this won't be anyway).
+ */
+ err = inode_change_ok(dentry->d_inode, attr);
+ if (err)
+ return err;
+
err = ovl_want_write(dentry);
if (err)
goto out;
- upperdentry = ovl_dentry_upper(dentry);
- if (upperdentry) {
- mutex_lock(&upperdentry->d_inode->i_mutex);
+ err = ovl_copy_up(dentry);
+ if (!err) {
+ upperdentry = ovl_dentry_upper(dentry);
+
+ inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
- mutex_unlock(&upperdentry->d_inode->i_mutex);
- } else {
- err = ovl_copy_up_last(dentry, attr, false);
+ if (!err)
+ ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
+ inode_unlock(upperdentry->d_inode);
}
ovl_drop_write(dentry);
out:
@@ -98,6 +110,29 @@ int ovl_permission(struct inode *inode, int mask)
realdentry = ovl_entry_real(oe, &is_upper);
+ if (ovl_is_default_permissions(inode)) {
+ struct kstat stat;
+ struct path realpath = { .dentry = realdentry };
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
+
+ err = vfs_getattr(&realpath, &stat);
+ if (err)
+ return err;
+
+ if ((stat.mode ^ inode->i_mode) & S_IFMT)
+ return -ESTALE;
+
+ inode->i_mode = stat.mode;
+ inode->i_uid = stat.uid;
+ inode->i_gid = stat.gid;
+
+ return generic_permission(inode, mask);
+ }
+
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
@@ -134,57 +169,23 @@ out_dput:
return err;
}
-
-struct ovl_link_data {
- struct dentry *realdentry;
- void *cookie;
-};
-
-static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
+static const char *ovl_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct dentry *realdentry;
struct inode *realinode;
- struct ovl_link_data *data = NULL;
- const char *ret;
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
- if (WARN_ON(!realinode->i_op->follow_link))
+ if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
- if (realinode->i_op->put_link) {
- data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
- if (!data)
- return ERR_PTR(-ENOMEM);
- data->realdentry = realdentry;
- }
-
- ret = realinode->i_op->follow_link(realdentry, cookie);
- if (IS_ERR_OR_NULL(ret)) {
- kfree(data);
- return ret;
- }
-
- if (data)
- data->cookie = *cookie;
-
- *cookie = data;
-
- return ret;
-}
-
-static void ovl_put_link(struct inode *unused, void *c)
-{
- struct inode *realinode;
- struct ovl_link_data *data = c;
-
- if (!data)
- return;
-
- realinode = data->realdentry->d_inode;
- realinode->i_op->put_link(realinode, data->cookie);
- kfree(data);
+ return realinode->i_op->get_link(realdentry, realinode, done);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
@@ -353,7 +354,7 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
return ERR_PTR(err);
if (file_flags & O_TRUNC)
- err = ovl_copy_up_last(dentry, NULL, true);
+ err = ovl_copy_up_truncate(dentry);
else
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
@@ -381,8 +382,7 @@ static const struct inode_operations ovl_file_inode_operations = {
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
- .follow_link = ovl_follow_link,
- .put_link = ovl_put_link,
+ .get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index ea5a40b06e3a..6a7090f4a441 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -142,7 +142,10 @@ struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
+ bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
@@ -163,6 +166,7 @@ extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
+int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
@@ -194,7 +198,6 @@ void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
- struct path *lowerpath, struct kstat *stat,
- struct iattr *attr);
+ struct path *lowerpath, struct kstat *stat);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 70e9af551600..6ec1e43a9a54 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -36,13 +36,14 @@ struct ovl_dir_cache {
struct ovl_readdir_data {
struct dir_context ctx;
- bool is_merge;
+ bool is_lowest;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
+ bool d_type_supported;
};
struct ovl_dir_file {
@@ -139,9 +140,9 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
return 0;
}
-static int ovl_fill_lower(struct ovl_readdir_data *rdd,
- const char *name, int namelen,
- loff_t offset, u64 ino, unsigned int d_type)
+static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
+ const char *name, int namelen,
+ loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
@@ -193,10 +194,10 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name,
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
- if (!rdd->is_merge)
+ if (!rdd->is_lowest)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
- return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
+ return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
@@ -228,7 +229,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
dput(dentry);
}
}
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
}
revert_creds(old_cred);
put_cred(override_cred);
@@ -289,7 +290,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
- .is_merge = false,
+ .is_lowest = false,
};
int idx, next;
@@ -306,7 +307,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
- rdd.is_merge = true;
+ rdd.is_lowest = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
@@ -399,7 +400,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
loff_t res;
struct ovl_dir_file *od = file->private_data;
- mutex_lock(&file_inode(file)->i_mutex);
+ inode_lock(file_inode(file));
if (!file->f_pos)
ovl_dir_reset(file);
@@ -429,7 +430,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
res = offset;
}
out_unlock:
- mutex_unlock(&file_inode(file)->i_mutex);
+ inode_unlock(file_inode(file));
return res;
}
@@ -454,10 +455,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
@@ -467,7 +468,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
fput(realfile);
realfile = od->upperfile;
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
}
@@ -479,9 +480,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file)
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ovl_cache_put(od, file->f_path.dentry);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
fput(od->realfile);
if (od->upperfile)
@@ -557,7 +558,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
- mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
@@ -571,8 +572,45 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
(int) PTR_ERR(dentry));
continue;
}
- ovl_cleanup(upper->d_inode, dentry);
+ if (dentry->d_inode)
+ ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
- mutex_unlock(&upper->d_inode->i_mutex);
+ inode_unlock(upper->d_inode);
+}
+
+/*
+ * dir_context actor used to probe d_type support: records in the enclosing
+ * ovl_readdir_data whether any entry other than "." and ".." was reported
+ * with a d_type different from DT_UNKNOWN. Always returns 0.
+ */
+static int ovl_check_d_type(struct dir_context *ctx, const char *name,
+			    int namelen, loff_t offset, u64 ino,
+			    unsigned int d_type)
+{
+	struct ovl_readdir_data *rdd =
+		container_of(ctx, struct ovl_readdir_data, ctx);
+	bool dot_entry = !strncmp(name, ".", namelen) ||
+			 !strncmp(name, "..", namelen);
+
+	/* Even if d_type is not supported, DT_DIR is returned for . and .. */
+	if (!dot_entry && d_type != DT_UNKNOWN)
+		rdd->d_type_supported = true;
+
+	return 0;
+}
+
+/*
+ * Read the directory at @realpath with ovl_check_d_type() as actor.
+ *
+ * Returns 1 if d_type is supported, 0 if it is not supported/unknown, or
+ * the non-zero error from ovl_dir_read().
+ */
+int ovl_check_d_type_supported(struct path *realpath)
+{
+	struct ovl_readdir_data rdd = {
+		.ctx.actor = ovl_check_d_type,
+		.d_type_supported = false,
+	};
+	int err = ovl_dir_read(realpath, &rdd);
+
+	return err ? err : rdd.d_type_supported;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e38ee0fed24a..5d972e6cd3fe 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -9,12 +9,14 @@
#include <linux/fs.h>
#include <linux/namei.h>
+#include <linux/pagemap.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/module.h>
+#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/statfs.h>
#include <linux/seq_file.h>
@@ -24,12 +26,11 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Overlay filesystem");
MODULE_LICENSE("GPL");
-#define OVERLAYFS_SUPER_MAGIC 0x794c7630
-
struct ovl_config {
char *lowerdir;
char *upperdir;
char *workdir;
+ bool default_permissions;
};
/* private information held for overlayfs's superblock */
@@ -75,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
if (oe->__upperdentry) {
type = __OVL_PATH_UPPER;
- if (oe->numlower) {
- if (S_ISDIR(dentry->d_inode->i_mode))
- type |= __OVL_PATH_MERGE;
- } else if (!oe->opaque) {
+ /*
+ * Non-dir dentry can hold lower dentry from previous
+ * location. Its purity depends only on opaque flag.
+ */
+ if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+ type |= __OVL_PATH_MERGE;
+ else if (!oe->opaque)
type |= __OVL_PATH_PURE;
- }
} else {
if (oe->numlower > 1)
type |= __OVL_PATH_MERGE;
@@ -154,6 +157,18 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
return realdentry;
}
+/*
+ * Pick a vfsmount for @oe: the upper mount of @inode's superblock when
+ * @is_upper is set, otherwise the mount of the first lowerstack entry, or
+ * NULL if @oe has no lower entries.
+ */
+struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
+				    bool is_upper)
+{
+	struct ovl_fs *ofs;
+
+	if (!is_upper)
+		return oe->numlower ? oe->lowerstack[0].mnt : NULL;
+
+	ofs = inode->i_sb->s_fs_info;
+	return ofs->upper_mnt;
+}
+
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -161,6 +176,13 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
return oe->cache;
}
+/*
+ * Report whether the overlay that @inode belongs to has the
+ * default_permissions config flag set.
+ */
+bool ovl_is_default_permissions(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ovl_fs *ofs = sb->s_fs_info;
+
+	return ofs->config.default_permissions;
+}
+
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -209,7 +231,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
- WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+ WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
WARN_ON(oe->__upperdentry);
BUG_ON(!upperdentry->d_inode);
/*
@@ -224,7 +246,7 @@ void ovl_dentry_version_inc(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
- WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ WARN_ON(!inode_is_locked(dentry->d_inode));
oe->version++;
}
@@ -232,7 +254,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
- WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+ WARN_ON(!inode_is_locked(dentry->d_inode));
return oe->version;
}
@@ -273,6 +295,37 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
+/*
+ * ->d_real hook: map overlay @dentry to an underlying dentry.
+ *
+ * A directory is returned as-is when @inode is NULL or is its own inode.
+ * For anything else the upper dentry is tried first, then the lower one; a
+ * candidate is accepted when @inode is NULL or is the candidate's inode. If
+ * the lower dentry does not match but has DCACHE_OP_REAL set, its own
+ * ->d_real is asked instead.
+ *
+ * When nothing matches, a warning is emitted and @dentry itself is returned.
+ */
+static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode)
+{
+	struct dentry *real;
+
+	if (d_is_dir(dentry)) {
+		if (!inode || inode == d_inode(dentry))
+			return dentry;
+		goto bug;
+	}
+
+	real = ovl_dentry_upper(dentry);
+	if (real && (!inode || inode == d_inode(real)))
+		return real;
+
+	real = ovl_dentry_lower(dentry);
+	if (!real)
+		goto bug;
+
+	if (!inode || inode == d_inode(real))
+		return real;
+
+	/* Handle recursion */
+	if (real->d_flags & DCACHE_OP_REAL)
+		return real->d_op->d_real(real, inode);
+
+bug:
+	/* Keep the message on a single log line: no newline before ')'. */
+	WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
+	     inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
+	return dentry;
+}
+
static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -317,10 +370,13 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
.d_select_inode = ovl_d_select_inode,
+ .d_real = ovl_d_real,
};
static const struct dentry_operations ovl_reval_dentry_operations = {
.d_release = ovl_dentry_release,
+ .d_select_inode = ovl_d_select_inode,
+ .d_real = ovl_d_real,
.d_revalidate = ovl_dentry_revalidate,
.d_weak_revalidate = ovl_dentry_weak_revalidate,
};
@@ -355,9 +411,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
{
struct dentry *dentry;
- mutex_lock(&dir->d_inode->i_mutex);
+ inode_lock(dir->d_inode);
dentry = lookup_one_len(name->name, dir, name->len);
- mutex_unlock(&dir->d_inode->i_mutex);
+ inode_unlock(dir->d_inode);
if (IS_ERR(dentry)) {
if (PTR_ERR(dentry) == -ENOENT)
@@ -594,6 +650,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
seq_show_option(m, "upperdir", ufs->config.upperdir);
seq_show_option(m, "workdir", ufs->config.workdir);
}
+ if (ufs->config.default_permissions)
+ seq_puts(m, ",default_permissions");
return 0;
}
@@ -618,6 +676,7 @@ enum {
OPT_LOWERDIR,
OPT_UPPERDIR,
OPT_WORKDIR,
+ OPT_DEFAULT_PERMISSIONS,
OPT_ERR,
};
@@ -625,6 +684,7 @@ static const match_table_t ovl_tokens = {
{OPT_LOWERDIR, "lowerdir=%s"},
{OPT_UPPERDIR, "upperdir=%s"},
{OPT_WORKDIR, "workdir=%s"},
+ {OPT_DEFAULT_PERMISSIONS, "default_permissions"},
{OPT_ERR, NULL}
};
@@ -685,6 +745,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
return -ENOMEM;
break;
+ case OPT_DEFAULT_PERMISSIONS:
+ config->default_permissions = true;
+ break;
+
default:
pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
return -EINVAL;
@@ -716,7 +780,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
if (err)
return ERR_PTR(err);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
retry:
work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
strlen(OVL_WORKDIR_NAME));
@@ -742,7 +806,7 @@ retry:
goto out_dput;
}
out_unlock:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
mnt_drop_write(mnt);
return work;
@@ -905,11 +969,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
err = -EINVAL;
if (!ufs->config.lowerdir) {
- pr_err("overlayfs: missing 'lowerdir'\n");
+ if (!silent)
+ pr_err("overlayfs: missing 'lowerdir'\n");
goto out_free_config;
}
sb->s_stack_depth = 0;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
if (ufs->config.upperdir) {
if (!ufs->config.workdir) {
pr_err("overlayfs: missing 'workdir'\n");
@@ -996,6 +1062,21 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags |= MS_RDONLY;
ufs->workdir = NULL;
}
+
+ /*
+ * Upper should support d_type, else whiteouts are visible.
+ * Given workdir and upper are on the same fs, we can do
+ * iterate_dir() on workdir.
+ */
+ err = ovl_check_d_type_supported(&workpath);
+ if (err < 0)
+ goto out_put_workdir;
+
+ if (!err) {
+ pr_err("overlayfs: upper fs needs to support d_type.\n");
+ err = -EINVAL;
+ goto out_put_workdir;
+ }
}
err = -ENOMEM;
@@ -1053,6 +1134,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
root_dentry->d_fsdata = oe;
+ ovl_copyattr(ovl_dentry_real(root_dentry)->d_inode,
+ root_dentry->d_inode);
+
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_op = &ovl_super_operations;
sb->s_root = root_dentry;
diff --git a/fs/pipe.c b/fs/pipe.c
index 8865f7963700..0d3f5165cb0b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576;
*/
unsigned int pipe_min_size = PAGE_SIZE;
+/* Maximum allocatable pages per user. The hard limit is unset (0) by default;
+ * the soft limit defaults to PIPE_DEF_BUFFERS * INR_OPEN_CUR.
+ */
+unsigned long pipe_user_pages_hard;
+unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
+
/*
* We use a start+len construction, which provides full use of the
* allocated memory.
@@ -128,7 +134,7 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
if (page_count(page) == 1 && !pipe->tmp_page)
pipe->tmp_page = page;
else
- page_cache_release(page);
+ put_page(page);
}
/**
@@ -174,7 +180,7 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
*/
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
- page_cache_get(buf->page);
+ get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);
@@ -205,7 +211,7 @@ EXPORT_SYMBOL(generic_pipe_buf_confirm);
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- page_cache_release(buf->page);
+ put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
@@ -366,18 +372,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
- int error = ops->confirm(pipe, buf);
- if (error)
+ ret = ops->confirm(pipe, buf);
+ if (ret)
goto out;
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
- error = -EFAULT;
+ ret = -EFAULT;
goto out;
}
do_wakeup = 1;
- buf->len += chars;
- ret = chars;
+ buf->len += ret;
if (!iov_iter_count(from))
goto out;
}
@@ -584,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on)
return retval;
}
+static void account_pipe_buffers(struct pipe_inode_info *pipe,
+ unsigned long old, unsigned long new)
+{
+ atomic_long_add(new - old, &pipe->user->pipe_bufs);
+}
+
+static bool too_many_pipe_buffers_soft(struct user_struct *user)
+{
+ return pipe_user_pages_soft &&
+ atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
+}
+
+static bool too_many_pipe_buffers_hard(struct user_struct *user)
+{
+ return pipe_user_pages_hard &&
+ atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
+}
+
struct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
if (pipe) {
- pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+ unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
+ struct user_struct *user = get_current_user();
+
+ if (!too_many_pipe_buffers_hard(user)) {
+ if (too_many_pipe_buffers_soft(user))
+ pipe_bufs = 1;
+ pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
+ }
+
if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = PIPE_DEF_BUFFERS;
+ pipe->buffers = pipe_bufs;
+ pipe->user = user;
+ account_pipe_buffers(pipe, 0, pipe_bufs);
mutex_init(&pipe->mutex);
return pipe;
}
+ free_uid(user);
kfree(pipe);
}
@@ -608,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
+ account_pipe_buffers(pipe, pipe->buffers, 0);
+ free_uid(pipe->user);
for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
@@ -693,17 +729,20 @@ int create_pipe_files(struct file **res, int flags)
d_instantiate(path.dentry, inode);
- err = -ENFILE;
f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
- if (IS_ERR(f))
+ if (IS_ERR(f)) {
+ err = PTR_ERR(f);
goto err_dentry;
+ }
f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
f->private_data = inode->i_pipe;
res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
- if (IS_ERR(res[0]))
+ if (IS_ERR(res[0])) {
+ err = PTR_ERR(res[0]);
goto err_file;
+ }
path_get(&path);
res[0]->private_data = inode->i_pipe;
@@ -996,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
}
+ account_pipe_buffers(pipe, pipe->buffers, nr_pages);
pipe->curbuf = 0;
kfree(pipe->bufs);
pipe->bufs = bufs;
@@ -1067,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
ret = -EPERM;
goto out;
+ } else if ((too_many_pipe_buffers_hard(pipe->user) ||
+ too_many_pipe_buffers_soft(pipe->user)) &&
+ !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out;
}
ret = pipe_set_size(pipe, nr_pages);
break;
diff --git a/fs/pnode.c b/fs/pnode.c
index 6367e1e435c6..c524fdddc7fb 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master;
static struct mountpoint *mp;
static struct hlist_head *list;
+static inline bool peers(struct mount *m1, struct mount *m2)
+{
+ return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
+
static int propagate_one(struct mount *m)
{
struct mount *child;
@@ -212,7 +217,7 @@ static int propagate_one(struct mount *m)
/* skip if mountpoint isn't covered by it */
if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
return 0;
- if (m->mnt_group_id == last_dest->mnt_group_id) {
+ if (peers(m, last_dest)) {
type = CL_MAKE_SHARED;
} else {
struct mount *n, *p;
@@ -223,7 +228,7 @@ static int propagate_one(struct mount *m)
last_source = last_source->mnt_master;
last_dest = last_source->mnt_parent;
}
- if (n->mnt_group_id != last_dest->mnt_group_id) {
+ if (!peers(n, last_dest)) {
last_source = last_source->mnt_master;
last_dest = last_source->mnt_parent;
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4fb17ded7d47..711dd5170376 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -762,8 +762,9 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
EXPORT_SYMBOL (posix_acl_to_xattr);
static int
-posix_acl_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
+posix_acl_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *value, size_t size)
{
struct posix_acl *acl;
int error;
@@ -773,7 +774,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
if (d_is_symlink(dentry))
return -EOPNOTSUPP;
- acl = get_acl(d_backing_inode(dentry), type);
+ acl = get_acl(d_backing_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -786,8 +787,9 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
}
static int
-posix_acl_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = d_backing_inode(dentry);
struct posix_acl *acl = NULL;
@@ -798,7 +800,7 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
return value ? -EACCES : 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -815,37 +817,20 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
}
}
- ret = inode->i_op->set_acl(inode, acl, type);
+ ret = inode->i_op->set_acl(inode, acl, handler->flags);
out:
posix_acl_release(acl);
return ret;
}
-static size_t
-posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static bool
+posix_acl_xattr_list(struct dentry *dentry)
{
- const char *xname;
- size_t size;
-
- if (!IS_POSIXACL(d_backing_inode(dentry)))
- return -EOPNOTSUPP;
- if (d_is_symlink(dentry))
- return -EOPNOTSUPP;
-
- if (type == ACL_TYPE_ACCESS)
- xname = POSIX_ACL_XATTR_ACCESS;
- else
- xname = POSIX_ACL_XATTR_DEFAULT;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
+ return IS_POSIXACL(d_backing_inode(dentry));
}
const struct xattr_handler posix_acl_access_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
@@ -854,7 +839,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
const struct xattr_handler posix_acl_default_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = posix_acl_xattr_list,
.get = posix_acl_xattr_get,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index eed2050db9be..b6c00ce0e29e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,18 +91,18 @@
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
char *buf;
+ size_t size;
char tcomm[sizeof(p->comm)];
+ int ret;
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- buf = m->buf + m->count;
- /* Ignore error for now */
- buf += string_escape_str(tcomm, buf, m->size - m->count,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ size = seq_get_buf(m, &buf);
+ ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ seq_commit(m, ret < size ? ret : -1);
- m->count = buf - m->buf;
seq_putc(m, '\n');
}
@@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
state = *get_task_state(task);
vsize = eip = esp = 0;
- permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
+ permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT);
mm = get_task_mm(task);
if (mm) {
vsize = task_vsize(mm);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index bd3e9e68125b..b1755b23893e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
- struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
+ struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (mm && !IS_ERR(mm)) {
unsigned int nwords = 0;
do {
@@ -430,10 +430,11 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
+ if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
+ && !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
else
- seq_putc(m, '0');
+ seq_puts(m, "0\n");
return 0;
}
@@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task)
int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
if (err)
return err;
- if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
mutex_unlock(&task->signal->cred_guard_mutex);
return -EPERM;
}
@@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
put_task_struct(task);
}
return allowed;
@@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
return true;
if (in_group_p(pid->pid_gid))
return true;
- return ptrace_may_access(task, PTRACE_MODE_READ);
+ return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
@@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
struct mm_struct *mm = ERR_PTR(-ESRCH);
if (task) {
- mm = mm_access(task, mode);
+ mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
put_task_struct(task);
if (!IS_ERR_OR_NULL(mm)) {
@@ -952,6 +953,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
unsigned long src = *ppos;
int ret = 0;
struct mm_struct *mm = file->private_data;
+ unsigned long env_start, env_end;
if (!mm)
return 0;
@@ -963,19 +965,25 @@ static ssize_t environ_read(struct file *file, char __user *buf,
ret = 0;
if (!atomic_inc_not_zero(&mm->mm_users))
goto free;
+
+ down_read(&mm->mmap_sem);
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ up_read(&mm->mmap_sem);
+
while (count > 0) {
size_t this_len, max_len;
int retval;
- if (src >= (mm->env_end - mm->env_start))
+ if (src >= (env_end - env_start))
break;
- this_len = mm->env_end - (mm->env_start + src);
+ this_len = env_end - (env_start + src);
max_len = min_t(size_t, PAGE_SIZE, count);
this_len = min(max_len, this_len);
- retval = access_remote_vm(mm, (mm->env_start + src),
+ retval = access_remote_vm(mm, (env_start + src),
page, this_len, 0);
if (retval <= 0) {
@@ -1564,12 +1572,16 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
return -ENOENT;
}
-static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_pid_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
struct path path;
int error = -EACCES;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
/* Are we allowed to snoop on the tasks file descriptors? */
if (!proc_fd_access_allowed(inode))
goto out;
@@ -1630,7 +1642,7 @@ out:
const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
- .follow_link = proc_pid_follow_link,
+ .get_link = proc_pid_get_link,
.setattr = proc_setattr,
};
@@ -1856,7 +1868,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (!task)
goto out_notask;
- mm = mm_access(task, PTRACE_MODE_READ);
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm))
goto out;
@@ -1895,7 +1907,7 @@ static const struct dentry_operations tid_map_files_dentry_operations = {
.d_delete = pid_delete_dentry,
};
-static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+static int map_files_get_link(struct dentry *dentry, struct path *path)
{
unsigned long vm_start, vm_end;
struct vm_area_struct *vma;
@@ -1945,20 +1957,22 @@ struct map_files_info {
* path to the file in question.
*/
static const char *
-proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+proc_map_files_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
- return proc_pid_follow_link(dentry, NULL);
+ return proc_pid_get_link(dentry, inode, done);
}
/*
- * Identical to proc_pid_link_inode_operations except for follow_link()
+ * Identical to proc_pid_link_inode_operations except for get_link()
*/
static const struct inode_operations proc_map_files_link_inode_operations = {
.readlink = proc_pid_readlink,
- .follow_link = proc_map_files_follow_link,
+ .get_link = proc_map_files_get_link,
.setattr = proc_setattr,
};
@@ -1975,7 +1989,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
return -ENOENT;
ei = PROC_I(inode);
- ei->op.proc_get_link = proc_map_files_get_link;
+ ei->op.proc_get_link = map_files_get_link;
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
@@ -2007,7 +2021,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
goto out;
result = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
result = -ENOENT;
@@ -2060,7 +2074,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
goto out;
ret = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
goto out_put_task;
ret = 0;
@@ -2144,6 +2158,7 @@ static const struct file_operations proc_map_files_operations = {
.llseek = default_llseek,
};
+#ifdef CONFIG_CHECKPOINT_RESTORE
struct timers_private {
struct pid *pid;
struct task_struct *task;
@@ -2242,6 +2257,73 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
+#endif
+
+static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file_inode(file);
+ struct task_struct *p;
+ u64 slack_ns;
+ int err;
+
+ err = kstrtoull_from_user(buf, count, 10, &slack_ns);
+ if (err < 0)
+ return err;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
+ task_lock(p);
+ if (slack_ns == 0)
+ p->timer_slack_ns = p->default_timer_slack_ns;
+ else
+ p->timer_slack_ns = slack_ns;
+ task_unlock(p);
+ } else
+ count = -EPERM;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int timerslack_ns_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+ int err = 0;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) {
+ task_lock(p);
+ seq_printf(m, "%llu\n", p->timer_slack_ns);
+ task_unlock(p);
+ } else
+ err = -EPERM;
+
+ put_task_struct(p);
+
+ return err;
+}
+
+static int timerslack_ns_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, timerslack_ns_show, inode);
+}
+
+static const struct file_operations proc_pid_set_timerslack_ns_operations = {
+ .open = timerslack_ns_open,
+ .read = seq_read,
+ .write = timerslack_ns_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2359,7 +2441,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
- char *page;
+ void *page;
ssize_t length;
struct task_struct *task = get_proc_task(inode);
@@ -2374,14 +2456,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (*ppos != 0)
goto out;
- length = -ENOMEM;
- page = (char*)__get_free_page(GFP_TEMPORARY);
- if (!page)
+ page = memdup_user(buf, count);
+ if (IS_ERR(page)) {
+ length = PTR_ERR(page);
goto out;
-
- length = -EFAULT;
- if (copy_from_user(page, buf, count))
- goto out_free;
+ }
/* Guard against adverse ptrace interaction */
length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
@@ -2390,10 +2469,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
length = security_setprocattr(task,
(char*)file->f_path.dentry->d_name.name,
- (void*)page, count);
+ page, count);
mutex_unlock(&task->signal->cred_guard_mutex);
out_free:
- free_page((unsigned long) page);
+ kfree(page);
out:
put_task_struct(task);
out_no_task:
@@ -2494,6 +2573,7 @@ static ssize_t proc_coredump_filter_write(struct file *file,
mm = get_task_mm(task);
if (!mm)
goto out_no_mm;
+ ret = 0;
for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
if (val & mask)
@@ -2529,7 +2609,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
if (result)
return result;
- if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
result = -EACCES;
goto out_unlock;
}
@@ -2819,6 +2899,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_CHECKPOINT_RESTORE
REG("timers", S_IRUGO, proc_timers_operations),
#endif
+ REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6e5fcd00733e..56afa5ef08f2 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -258,6 +258,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
name, len, instantiate, p,
(void *)(unsigned long)fd))
goto out_fd_loop;
+ cond_resched();
rcu_read_lock();
}
rcu_read_unlock();
@@ -291,11 +292,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
*/
int proc_fd_permission(struct inode *inode, int mask)
{
- int rv = generic_permission(inode, mask);
+ struct task_struct *p;
+ int rv;
+
+ rv = generic_permission(inode, mask);
if (rv == 0)
- return 0;
- if (task_tgid(current) == proc_pid(inode))
+ return rv;
+
+ rcu_read_lock();
+ p = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (p && same_thread_group(p, current))
rv = 0;
+ rcu_read_unlock();
+
return rv;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index bd95b9fdebb0..42305ddcbaa0 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,8 @@ void __init proc_init_inodecache(void)
proc_inode_cachep = kmem_cache_create("proc_inode_cache",
sizeof(struct proc_inode),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_PANIC),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+ SLAB_PANIC),
init_once);
}
@@ -393,24 +394,25 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
};
#endif
-static const char *proc_follow_link(struct dentry *dentry, void **cookie)
+static void proc_put_link(void *p)
{
- struct proc_dir_entry *pde = PDE(d_inode(dentry));
- if (unlikely(!use_pde(pde)))
- return ERR_PTR(-EINVAL);
- *cookie = pde;
- return pde->data;
+ unuse_pde(p);
}
-static void proc_put_link(struct inode *unused, void *p)
+static const char *proc_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- unuse_pde(p);
+ struct proc_dir_entry *pde = PDE(inode);
+ if (unlikely(!use_pde(pde)))
+ return ERR_PTR(-EINVAL);
+ set_delayed_call(done, proc_put_link, pde);
+ return pde->data;
}
const struct inode_operations proc_link_inode_operations = {
.readlink = generic_readlink,
- .follow_link = proc_follow_link,
- .put_link = proc_put_link,
+ .get_link = proc_get_link,
};
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 92e6726f6e37..a939f5ed7f89 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
i_size_write(inode, proc_root_kcore->size);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 9155a5a0d3b9..83720460c5bc 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -29,10 +29,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
unsigned long committed;
long cached;
long available;
- unsigned long pagecache;
- unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
- struct zone *zone;
int lru;
/*
@@ -51,36 +48,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
- for_each_zone(zone)
- wmark_low += zone->watermark[WMARK_LOW];
-
- /*
- * Estimate the amount of memory available for userspace allocations,
- * without causing swapping.
- *
- * Free memory cannot be taken below the low watermark, before the
- * system starts swapping.
- */
- available = i.freeram - wmark_low;
-
- /*
- * Not all the page cache can be freed, otherwise the system will
- * start swapping. Assume at least half of the page cache, or the
- * low watermark worth of cache, needs to stay.
- */
- pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
- pagecache -= min(pagecache / 2, wmark_low);
- available += pagecache;
-
- /*
- * Part of the reclaimable slab consists of items that are in use,
- * and cannot be freed. Cap this estimate at the low watermark.
- */
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
-
- if (available < 0)
- available = 0;
+ available = si_mem_available();
/*
* Tagged format, for easy grepping and expansion.
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index f6e8354b8cea..72cb26f85d58 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -28,21 +28,28 @@ static const struct proc_ns_operations *ns_entries[] = {
&userns_operations,
#endif
&mntns_operations,
+#ifdef CONFIG_CGROUPS
+ &cgroupns_operations,
+#endif
};
-static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_ns_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
struct task_struct *task;
struct path ns_path;
void *error = ERR_PTR(-EACCES);
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
task = get_proc_task(inode);
if (!task)
return error;
- if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
error = ns_get_path(&ns_path, task, ns_ops);
if (!error)
nd_jump_link(&ns_path);
@@ -63,7 +70,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (!task)
return res;
- if (ptrace_may_access(task, PTRACE_MODE_READ)) {
+ if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0)
res = readlink_copy(buffer, buflen, name);
@@ -74,7 +81,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
static const struct inode_operations proc_ns_link_inode_operations = {
.readlink = proc_ns_readlink,
- .follow_link = proc_ns_follow_link,
+ .get_link = proc_ns_get_link,
.setattr = proc_setattr,
};
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a03d..712f1b9992cc 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page)
*/
if (PageBuddy(page))
u |= 1 << KPF_BUDDY;
+ else if (page_count(page) == 0 && is_free_buddy_page(page))
+ u |= 1 << KPF_BUDDY;
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
@@ -158,6 +160,8 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
+ if (PageTail(page) && PageSlab(compound_head(page)))
+ u |= 1 << KPF_SLAB;
u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 113b8d061fc0..b6a8d3529fea 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -18,26 +18,28 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
if (!tgid)
return ERR_PTR(-ENOENT);
/* 11 for max length of signed int in decimal + NULL term */
- name = kmalloc(12, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
+ name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d", tgid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_self_inode_operations = {
.readlink = proc_self_readlink,
- .follow_link = proc_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_self_get_link,
};
static unsigned self_inum;
@@ -48,7 +50,7 @@ int proc_setup_self(struct super_block *s)
struct pid_namespace *ns = s->s_fs_info;
struct dentry *self;
- mutex_lock(&root_inode->i_mutex);
+ inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
if (self) {
struct inode *inode = new_inode_pseudo(s);
@@ -67,7 +69,7 @@ int proc_setup_self(struct super_block *s)
} else {
self = ERR_PTR(-ENOMEM);
}
- mutex_unlock(&root_inode->i_mutex);
+ inode_unlock(root_inode);
if (IS_ERR(self)) {
pr_err("proc_fill_super: can't allocate /proc/self\n");
return PTR_ERR(self);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187b3b5f242e..229cb546bee0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -22,9 +23,13 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib, swap, ptes, pmds;
+ unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ anon = get_mm_counter(mm, MM_ANONPAGES);
+ file = get_mm_counter(mm, MM_FILEPAGES);
+ shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
* hiwater_rss only when about to *lower* total_vm or rss. Any
@@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
hiwater_vm = total_vm = mm->total_vm;
if (hiwater_vm < mm->hiwater_vm)
hiwater_vm = mm->hiwater_vm;
- hiwater_rss = total_rss = get_mm_rss(mm);
+ hiwater_rss = total_rss = anon + file + shmem;
if (hiwater_rss < mm->hiwater_rss)
hiwater_rss = mm->hiwater_rss;
- data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmPin:\t%8lu kB\n"
"VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
+ "RssAnon:\t%8lu kB\n"
+ "RssFile:\t%8lu kB\n"
+ "RssShmem:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
@@ -65,7 +72,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
mm->pinned_vm << (PAGE_SHIFT-10),
hiwater_rss << (PAGE_SHIFT-10),
total_rss << (PAGE_SHIFT-10),
- data << (PAGE_SHIFT-10),
+ anon << (PAGE_SHIFT-10),
+ file << (PAGE_SHIFT-10),
+ shmem << (PAGE_SHIFT-10),
+ mm->data_vm << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
ptes >> 10,
pmds >> 10,
@@ -82,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES);
+ *shared = get_mm_counter(mm, MM_FILEPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
- *data = mm->total_vm - mm->shared_vm;
+ *data = mm->data_vm + mm->stack_vm;
*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -248,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-static pid_t pid_of_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, bool is_pid)
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, int is_pid)
{
- struct inode *inode = priv->inode;
- struct task_struct *task;
- pid_t ret = 0;
+ int stack = 0;
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task) {
- task = task_of_stack(task, vma, is_pid);
+ if (is_pid) {
+ stack = vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack;
+ } else {
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task)
- ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ stack = vma_is_stack_for_task(vma, task);
+ rcu_read_unlock();
}
- rcu_read_unlock();
-
- return ret;
+ return stack;
}
static void
@@ -324,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
name = arch_vma_name(vma);
if (!name) {
- pid_t tid;
-
if (!mm) {
name = "[vdso]";
goto done;
@@ -337,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
goto done;
}
- tid = pid_of_stack(priv, vma, is_pid);
- if (tid != 0) {
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack)) {
- name = "[stack]";
- } else {
- /* Thread stack in /proc/PID/maps */
- seq_pad(m, ' ');
- seq_printf(m, "[stack:%d]", tid);
- }
- }
+ if (is_stack(priv, vma, is_pid))
+ name = "[stack]";
}
done:
@@ -451,12 +453,14 @@ struct mem_size_stats {
unsigned long private_hugetlb;
u64 pss;
u64 swap_pss;
+ bool check_shmem_swap;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- unsigned long size, bool young, bool dirty)
+ bool compound, bool young, bool dirty)
{
- int mapcount;
+ int i, nr = compound ? 1 << compound_order(page) : 1;
+ unsigned long size = nr * PAGE_SIZE;
if (PageAnon(page))
mss->anonymous += size;
@@ -465,25 +469,52 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
/* Accumulate the size in pages that have been accessed. */
if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
- mapcount = page_mapcount(page);
- if (mapcount >= 2) {
- u64 pss_delta;
- if (dirty || PageDirty(page))
- mss->shared_dirty += size;
- else
- mss->shared_clean += size;
- pss_delta = (u64)size << PSS_SHIFT;
- do_div(pss_delta, mapcount);
- mss->pss += pss_delta;
- } else {
+ /*
+ * page_count(page) == 1 guarantees the page is mapped exactly once.
+ * If any subpage of the compound page mapped with PTE it would elevate
+ * page_count().
+ */
+ if (page_count(page) == 1) {
if (dirty || PageDirty(page))
mss->private_dirty += size;
else
mss->private_clean += size;
mss->pss += (u64)size << PSS_SHIFT;
+ return;
}
+
+ for (i = 0; i < nr; i++, page++) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2) {
+ if (dirty || PageDirty(page))
+ mss->shared_dirty += PAGE_SIZE;
+ else
+ mss->shared_clean += PAGE_SIZE;
+ mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+ } else {
+ if (dirty || PageDirty(page))
+ mss->private_dirty += PAGE_SIZE;
+ else
+ mss->private_clean += PAGE_SIZE;
+ mss->pss += PAGE_SIZE << PSS_SHIFT;
+ }
+ }
+}
+
+#ifdef CONFIG_SHMEM
+static int smaps_pte_hole(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+
+ mss->swap += shmem_partial_swap_usage(
+ walk->vma->vm_file->f_mapping, addr, end);
+
+ return 0;
}
+#endif
static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
@@ -512,11 +543,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+ } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
+ && pte_none(*pte))) {
+ page = find_get_entry(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
+ if (!page)
+ return;
+
+ if (radix_tree_exceptional_entry(page))
+ mss->swap += PAGE_SIZE;
+ else
+ put_page(page);
+
+ return;
}
if (!page)
return;
- smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -532,8 +577,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
if (IS_ERR_OR_NULL(page))
return;
mss->anonymous_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, HPAGE_PMD_SIZE,
- pmd_young(*pmd), pmd_dirty(*pmd));
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -549,7 +593,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
smaps_pmd_entry(pmd, addr, walk);
spin_unlock(ptl);
return 0;
@@ -615,11 +660,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /* These come out via ProtectionKey: */
+ [ilog2(VM_PKEY_BIT0)] = "",
+ [ilog2(VM_PKEY_BIT1)] = "",
+ [ilog2(VM_PKEY_BIT2)] = "",
+ [ilog2(VM_PKEY_BIT3)] = "",
+#endif
};
size_t i;
seq_puts(m, "VmFlags: ");
for (i = 0; i < BITS_PER_LONG; i++) {
+ if (!mnemonics[i][0])
+ continue;
if (vma->vm_flags & (1UL << i)) {
seq_printf(m, "%c%c ",
mnemonics[i][0], mnemonics[i][1]);
@@ -657,6 +711,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
}
#endif /* HUGETLB_PAGE */
+void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
+{
+}
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
@@ -671,6 +729,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
};
memset(&mss, 0, sizeof mss);
+
+#ifdef CONFIG_SHMEM
+ if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
+ /*
+ * For shared or readonly shmem mappings we know that all
+ * swapped out pages belong to the shmem object, and we can
+ * obtain the swap value much more efficiently. For private
+ * writable mappings, we might have COW pages that are
+ * not affected by the parent swapped out pages of the shmem
+ * object, so we have to distinguish them during the page walk.
+ * Unless we know that the shmem object (or the part mapped by
+ * our VMA) has no swapped out pages at all.
+ */
+ unsigned long shmem_swapped = shmem_swap_usage(vma);
+
+ if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+ !(vma->vm_flags & VM_WRITE)) {
+ mss.swap = shmem_swapped;
+ } else {
+ mss.check_shmem_swap = true;
+ smaps_walk.pte_hole = smaps_pte_hole;
+ }
+ }
+#endif
+
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
@@ -713,6 +796,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
+ arch_show_smap(m, vma);
show_smap_vma_flags(m, vma);
m_cache_vma(m, vma);
return 0;
@@ -817,9 +901,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
pmd = pmd_wrprotect(pmd);
pmd = pmd_clear_soft_dirty(pmd);
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
-
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
#else
@@ -838,7 +919,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
clear_soft_dirty_pmd(vma, addr, pmd);
goto out;
@@ -1112,7 +1194,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmdp, vma);
+ if (ptl) {
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
@@ -1444,7 +1527,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
pte_t *orig_pte;
pte_t *pte;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
@@ -1473,18 +1557,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
+ pte_t huge_pte = huge_ptep_get(pte);
struct numa_maps *md;
struct page *page;
- if (!pte_present(*pte))
+ if (!pte_present(huge_pte))
return 0;
- page = pte_page(*pte);
+ page = pte_page(huge_pte);
if (!page)
return 0;
md = walk->private;
- gather_stats(page, md, pte_dirty(*pte), 1);
+ gather_stats(page, md, pte_dirty(huge_pte), 1);
return 0;
}
@@ -1538,19 +1623,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
seq_file_path(m, file, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_puts(m, " heap");
- } else {
- pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
- if (tid != 0) {
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack))
- seq_puts(m, " stack");
- else
- seq_printf(m, " stack:%d", tid);
- }
+ } else if (is_stack(proc_priv, vma, is_pid)) {
+ seq_puts(m, " stack");
}
if (is_vm_hugetlb_page(vma))
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index e0d64c92e4f6..faacb0c0d857 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static pid_t pid_of_stack(struct proc_maps_private *priv,
- struct vm_area_struct *vma, bool is_pid)
+static int is_stack(struct proc_maps_private *priv,
+ struct vm_area_struct *vma, int is_pid)
{
- struct inode *inode = priv->inode;
- struct task_struct *task;
- pid_t ret = 0;
-
- rcu_read_lock();
- task = pid_task(proc_pid(inode), PIDTYPE_PID);
- if (task) {
- task = task_of_stack(task, vma, is_pid);
+ struct mm_struct *mm = vma->vm_mm;
+ int stack = 0;
+
+ if (is_pid) {
+ stack = vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack;
+ } else {
+ struct inode *inode = priv->inode;
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task)
- ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+ stack = vma_is_stack_for_task(vma, task);
+ rcu_read_unlock();
}
- rcu_read_unlock();
-
- return ret;
+ return stack;
}
/*
@@ -181,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm) {
- pid_t tid = pid_of_stack(priv, vma, is_pid);
-
- if (tid != 0) {
- seq_pad(m, ' ');
- /*
- * Thread stack in /proc/PID/task/TID/maps or
- * the main process stack.
- */
- if (!is_pid || (vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack))
- seq_printf(m, "[stack]");
- else
- seq_printf(m, "[stack:%d]", tid);
- }
+ } else if (mm && is_stack(priv, vma, is_pid)) {
+ seq_pad(m, ' ');
+ seq_printf(m, "[stack]");
}
seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 947b0f4fd0a1..e58a31e8fb2a 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -19,26 +19,29 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
return readlink_copy(buffer, buflen, tmp);
}
-static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_thread_self_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+ struct pid_namespace *ns = inode->i_sb->s_fs_info;
pid_t tgid = task_tgid_nr_ns(current, ns);
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
if (!pid)
return ERR_PTR(-ENOENT);
- name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
- if (!name)
- return ERR_PTR(-ENOMEM);
+ name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF,
+ dentry ? GFP_KERNEL : GFP_ATOMIC);
+ if (unlikely(!name))
+ return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
sprintf(name, "%d/task/%d", tgid, pid);
- return *cookie = name;
+ set_delayed_call(done, kfree_link, name);
+ return name;
}
static const struct inode_operations proc_thread_self_inode_operations = {
.readlink = proc_thread_self_readlink,
- .follow_link = proc_thread_self_follow_link,
- .put_link = kfree_put_link,
+ .get_link = proc_thread_self_get_link,
};
static unsigned thread_self_inum;
@@ -49,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s)
struct pid_namespace *ns = s->s_fs_info;
struct dentry *thread_self;
- mutex_lock(&root_inode->i_mutex);
+ inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
if (thread_self) {
struct inode *inode = new_inode_pseudo(s);
@@ -68,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s)
} else {
thread_self = ERR_PTR(-ENOMEM);
}
- mutex_unlock(&root_inode->i_mutex);
+ inode_unlock(root_inode);
if (IS_ERR(thread_self)) {
pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
return PTR_ERR(thread_self);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4e61388ec03d..8afe10cf7df8 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -231,7 +231,9 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
list_for_each_entry(m, &vmcore_list, list) {
if (*fpos < m->offset + m->size) {
- tsz = min_t(size_t, m->offset + m->size - *fpos, buflen);
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - *fpos,
+ buflen);
start = m->paddr + *fpos - m->offset;
tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
if (tmp < 0)
@@ -277,12 +279,12 @@ static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (!page)
return VM_FAULT_OOM;
if (!PageUptodate(page)) {
- offset = (loff_t) index << PAGE_CACHE_SHIFT;
+ offset = (loff_t) index << PAGE_SHIFT;
buf = __va((page_to_pfn(page) << PAGE_SHIFT));
rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
if (rc < 0) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}
SetPageUptodate(page);
@@ -461,7 +463,8 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (start < m->offset + m->size) {
u64 paddr = 0;
- tsz = min_t(size_t, m->offset + m->size - start, size);
+ tsz = (size_t)min_t(unsigned long long,
+ m->offset + m->size - start, size);
paddr = m->paddr + start - m->offset;
if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
paddr >> PAGE_SHIFT, tsz,
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8ebd9a334085..3f1190d18991 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -95,9 +95,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
{
struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
- int err = 0;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
+ int err;
if (sb->s_op->show_devname) {
err = sb->s_op->show_devname(m, mnt_path.dentry);
@@ -131,16 +131,17 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
struct mount *r = real_mount(mnt);
struct super_block *sb = mnt->mnt_sb;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
- int err = 0;
+ int err;
seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
MAJOR(sb->s_dev), MINOR(sb->s_dev));
- if (sb->s_op->show_path)
+ if (sb->s_op->show_path) {
err = sb->s_op->show_path(m, mnt->mnt_root);
- else
+ if (err)
+ goto out;
+ } else {
seq_dentry(m, mnt->mnt_root, " \t\n\\");
- if (err)
- goto out;
+ }
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -168,12 +169,13 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, " - ");
show_type(m, sb);
seq_putc(m, ' ');
- if (sb->s_op->show_devname)
+ if (sb->s_op->show_devname) {
err = sb->s_op->show_devname(m, mnt->mnt_root);
- else
+ if (err)
+ goto out;
+ } else {
mangle(m, r->mnt_devname ? r->mnt_devname : "none");
- if (err)
- goto out;
+ }
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
err = show_sb_opts(m, sb);
if (err)
@@ -191,12 +193,14 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
struct mount *r = real_mount(mnt);
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
struct super_block *sb = mnt_path.dentry->d_sb;
- int err = 0;
+ int err;
/* device */
if (sb->s_op->show_devname) {
seq_puts(m, "device ");
err = sb->s_op->show_devname(m, mnt_path.dentry);
+ if (err)
+ goto out;
} else {
if (r->mnt_devname) {
seq_puts(m, "device ");
@@ -220,8 +224,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
/* optional statistics */
if (sb->s_op->show_stats) {
seq_putc(m, ' ');
- if (!err)
- err = sb->s_op->show_stats(m, mnt_path.dentry);
+ err = sb->s_op->show_stats(m, mnt_path.dentry);
}
seq_putc(m, '\n');
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d8c439d813ce..45d6110744cb 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
break;
}
- mutex_lock(&d_inode(root)->i_mutex);
+ inode_lock(d_inode(root));
dentry = d_alloc_name(root, name);
if (!dentry)
@@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
list_add(&private->list, &allpstore);
spin_unlock_irqrestore(&allpstore_lock, flags);
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
return 0;
fail_lockedalloc:
- mutex_unlock(&d_inode(root)->i_mutex);
+ inode_unlock(d_inode(root));
kfree(private);
fail_alloc:
iput(inode);
@@ -420,8 +420,8 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
pstore_sb = sb;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = PSTOREFS_MAGIC;
sb->s_op = &pstore_ops;
sb->s_time_gran = 1;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 319c3a60cfa5..bd9812e83461 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -55,8 +55,8 @@ static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
MODULE_PARM_DESC(pmsg_size, "size of user space message log");
-static ulong mem_address;
-module_param(mem_address, ulong, 0400);
+static unsigned long long mem_address;
+module_param(mem_address, ullong, 0400);
MODULE_PARM_DESC(mem_address,
"start of reserved RAM used to store oops/panic logs");
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index c4bcb778886e..3a67cfb142d8 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -316,6 +316,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &qnx4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx4_aops;
qnx4_i(inode)->mmu_private = inode->i_size;
} else {
@@ -364,7 +365,7 @@ static int init_inodecache(void)
qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
sizeof(struct qnx4_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (qnx4_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index e1f37278cf97..144ceda4948e 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -35,9 +35,9 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
static unsigned last_entry(struct inode *inode, unsigned long page_nr)
{
unsigned long last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte / QNX6_DIR_ENTRY_SIZE;
}
@@ -47,9 +47,9 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
{
struct qnx6_sb_info *sbi = QNX6_SB(sb);
u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */
- u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */
+ u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */
/* within page */
- u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK;
+ u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK;
struct address_space *mapping = sbi->longfile->i_mapping;
struct page *page = read_mapping_page(mapping, n, NULL);
if (IS_ERR(page))
@@ -115,8 +115,8 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
struct qnx6_sb_info *sbi = QNX6_SB(s);
loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
unsigned long npages = dir_pages(inode);
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
- unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
+ unsigned long n = pos >> PAGE_SHIFT;
+ unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE;
bool done = false;
ctx->pos = pos;
@@ -131,7 +131,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
if (IS_ERR(page)) {
pr_err("%s(): read failed\n", __func__);
- ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
+ ctx->pos = (n + 1) << PAGE_SHIFT;
return PTR_ERR(page);
}
de = ((struct qnx6_dir_entry *)page_address(page)) + start;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 32d2e1a9774c..1192422a1c56 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -542,8 +542,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
iget_failed(inode);
return ERR_PTR(-EIO);
}
- n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS);
- offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS);
+ n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS);
+ offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS);
mapping = sbi->inodes->i_mapping;
page = read_mapping_page(mapping, n, NULL);
if (IS_ERR(page)) {
@@ -582,6 +582,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode->i_mapping->a_ops = &qnx6_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &qnx6_aops;
} else
init_special_inode(inode, inode->i_mode, 0);
@@ -624,7 +625,7 @@ static int init_inodecache(void)
qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
sizeof(struct qnx6_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!qnx6_inode_cachep)
return -ENOMEM;
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index d3fb2b698800..f23b5c4a66ad 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -128,7 +128,7 @@ extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s,
static inline void qnx6_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ef0d64b2a6d9..ff21980d0119 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -411,6 +411,8 @@ int dquot_acquire(struct dquot *dquot)
ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
if (ret < 0)
goto out_iolock;
+ /* Make sure flags update is visible after dquot has been filled */
+ smp_mb__before_atomic();
set_bit(DQ_READ_B, &dquot->dq_flags);
/* Instantiate dquot if needed */
if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
@@ -427,6 +429,11 @@ int dquot_acquire(struct dquot *dquot)
goto out_iolock;
}
}
+ /*
+ * Make sure flags update is visible after on-disk struct has been
+ * allocated. Paired with smp_rmb() in dqget().
+ */
+ smp_mb__before_atomic();
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
mutex_unlock(&dqopt->dqio_mutex);
@@ -682,9 +689,9 @@ int dquot_quota_sync(struct super_block *sb, int type)
continue;
if (!sb_has_quota_active(sb, cnt))
continue;
- mutex_lock(&dqopt->files[cnt]->i_mutex);
+ inode_lock(dqopt->files[cnt]);
truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
- mutex_unlock(&dqopt->files[cnt]->i_mutex);
+ inode_unlock(dqopt->files[cnt]);
}
mutex_unlock(&dqopt->dqonoff_mutex);
@@ -887,6 +894,11 @@ we_slept:
goto out;
}
}
+ /*
+ * Make sure following reads see filled structure - paired with
+ * smp_mb__before_atomic() in dquot_acquire().
+ */
+ smp_rmb();
#ifdef CONFIG_QUOTA_DEBUG
BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */
#endif
@@ -1398,7 +1410,7 @@ static int dquot_active(const struct inode *inode)
static int __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
- struct dquot **dquots, *got[MAXQUOTAS];
+ struct dquot **dquots, *got[MAXQUOTAS] = {};
struct super_block *sb = inode->i_sb;
qsize_t rsv;
int ret = 0;
@@ -1415,7 +1427,6 @@ static int __dquot_initialize(struct inode *inode, int type)
int rc;
struct dquot *dquot;
- got[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
/*
@@ -2031,6 +2042,30 @@ int dquot_commit_info(struct super_block *sb, int type)
}
EXPORT_SYMBOL(dquot_commit_info);
+int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ int err;
+
+ mutex_lock(&dqopt->dqonoff_mutex);
+ if (!sb_has_quota_active(sb, qid->type)) {
+ err = -ESRCH;
+ goto out;
+ }
+ if (!dqopt->ops[qid->type]->get_next_id) {
+ err = -ENOSYS;
+ goto out;
+ }
+ mutex_lock(&dqopt->dqio_mutex);
+ err = dqopt->ops[qid->type]->get_next_id(sb, qid);
+ mutex_unlock(&dqopt->dqio_mutex);
+out:
+ mutex_unlock(&dqopt->dqonoff_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL(dquot_get_next_id);
+
/*
* Definitions of diskquota operations.
*/
@@ -2042,6 +2077,7 @@ const struct dquot_operations dquot_operations = {
.write_info = dquot_commit_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+ .get_next_id = dquot_get_next_id,
};
EXPORT_SYMBOL(dquot_operations);
@@ -2162,12 +2198,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
/* If quota was reenabled in the meantime, we have
* nothing to do */
if (!sb_has_quota_loaded(sb, cnt)) {
- mutex_lock(&toputinode[cnt]->i_mutex);
+ inode_lock(toputinode[cnt]);
toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
S_NOATIME | S_NOQUOTA);
truncate_inode_pages(&toputinode[cnt]->i_data,
0);
- mutex_unlock(&toputinode[cnt]->i_mutex);
+ inode_unlock(toputinode[cnt]);
mark_inode_dirty_sync(toputinode[cnt]);
}
mutex_unlock(&dqopt->dqonoff_mutex);
@@ -2258,11 +2294,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
/* We don't want quota and atime on quota files (deadlocks
* possible) Also nobody should write to the file - we use
* special IO operations which ignore the immutable bit. */
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
S_NOQUOTA);
inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
/*
* When S_NOQUOTA is set, remove dquot references as no more
* references can be added
@@ -2305,12 +2341,12 @@ out_file_init:
iput(inode);
out_lock:
if (oldflags != -1) {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Set the flags back (in the case of accidental quotaon()
* on a wrong file we don't want to mess up the flags) */
inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
inode->i_flags |= oldflags;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
mutex_unlock(&dqopt->dqonoff_mutex);
out_fmt:
@@ -2430,9 +2466,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
- mutex_lock(&d_inode(sb->s_root)->i_mutex);
- dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
- mutex_unlock(&d_inode(sb->s_root)->i_mutex);
+ dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -2565,6 +2599,27 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
}
EXPORT_SYMBOL(dquot_get_dqblk);
+int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
+ struct qc_dqblk *di)
+{
+ struct dquot *dquot;
+ int err;
+
+ if (!sb->dq_op->get_next_id)
+ return -ENOSYS;
+ err = sb->dq_op->get_next_id(sb, qid);
+ if (err < 0)
+ return err;
+ dquot = dqget(sb, *qid);
+ if (IS_ERR(dquot))
+ return PTR_ERR(dquot);
+ do_get_dqblk(dquot, di);
+ dqput(dquot);
+
+ return 0;
+}
+EXPORT_SYMBOL(dquot_get_next_dqblk);
+
#define VFS_QC_MASK \
(QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
@@ -2765,6 +2820,7 @@ const struct quotactl_ops dquot_quotactl_ops = {
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
.set_dqblk = dquot_set_dqblk
};
EXPORT_SYMBOL(dquot_quotactl_ops);
@@ -2776,6 +2832,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = {
.get_state = dquot_get_state,
.set_info = dquot_set_dqinfo,
.get_dqblk = dquot_get_dqblk,
+ .get_nextdqblk = dquot_get_next_dqblk,
.set_dqblk = dquot_set_dqblk
};
EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
@@ -2924,4 +2981,4 @@ static int __init dquot_init(void)
return 0;
}
-module_init(dquot_init);
+fs_initcall(dquot_init);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index bb2869f5dfd8..d07a2f91d858 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -1,7 +1,5 @@
-
#include <linux/cred.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/quotaops.h>
#include <linux/sched.h>
@@ -105,5 +103,4 @@ static int __init quota_init(void)
"VFS: Failed to create quota netlink interface.\n");
return 0;
};
-
-module_init(quota_init);
+fs_initcall(quota_init);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 3746367098fd..0f10ee9892ce 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,7 +79,7 @@ unsigned int qtype_enforce_flag(int type)
return 0;
}
-static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
+static int quota_quotaon(struct super_block *sb, int type, qid_t id,
struct path *path)
{
if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
@@ -222,6 +222,34 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
return 0;
}
+/*
+ * Return quota for next active quota >= this id, if any exists,
+ * otherwise return -ENOENT via ->get_nextdqblk
+ */
+static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
+ void __user *addr)
+{
+ struct kqid qid;
+ struct qc_dqblk fdq;
+ struct if_nextdqblk idq;
+ int ret;
+
+ if (!sb->s_qcop->get_nextdqblk)
+ return -ENOSYS;
+ qid = make_kqid(current_user_ns(), type, id);
+ if (!qid_valid(qid))
+ return -EINVAL;
+ ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
+ if (ret)
+ return ret;
+ /* struct if_nextdqblk is a superset of struct if_dqblk */
+ copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq);
+ idq.dqb_id = from_kqid(current_user_ns(), qid);
+ if (copy_to_user(addr, &idq, sizeof(idq)))
+ return -EFAULT;
+ return 0;
+}
+
static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
{
dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
@@ -625,6 +653,34 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
return ret;
}
+/*
+ * Q_XGETNEXTQUOTA handler: ask ->get_nextdqblk for the next active quota
+ * with an id >= @id and copy it to userspace at @addr in
+ * struct fs_disk_quota format. Any error from ->get_nextdqblk (e.g.
+ * -ENOENT when no such quota exists) is returned unchanged.
+ */
+static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
+			       void __user *addr)
+{
+	struct fs_disk_quota xfs_blk;
+	struct qc_dqblk found;
+	struct kqid kq;
+	int err;
+
+	if (sb->s_qcop->get_nextdqblk == NULL)
+		return -ENOSYS;
+
+	kq = make_kqid(current_user_ns(), type, id);
+	if (!qid_valid(kq))
+		return -EINVAL;
+
+	/* the callback updates kq to the id it actually found */
+	err = sb->s_qcop->get_nextdqblk(sb, &kq, &found);
+	if (err != 0)
+		return err;
+
+	copy_to_xfs_dqblk(&xfs_blk, &found, type,
+			  from_kqid(current_user_ns(), kq));
+	if (copy_to_user(addr, &xfs_blk, sizeof(xfs_blk)) != 0)
+		return -EFAULT;
+
+	return 0;
+}
+
static int quota_rmxquota(struct super_block *sb, void __user *addr)
{
__u32 flags;
@@ -659,7 +715,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
switch (cmd) {
case Q_QUOTAON:
- return quota_quotaon(sb, type, cmd, id, path);
+ return quota_quotaon(sb, type, id, path);
case Q_QUOTAOFF:
return quota_quotaoff(sb, type);
case Q_GETFMT:
@@ -670,6 +726,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
return quota_setinfo(sb, type, addr);
case Q_GETQUOTA:
return quota_getquota(sb, type, id, addr);
+ case Q_GETNEXTQUOTA:
+ return quota_getnextquota(sb, type, id, addr);
case Q_SETQUOTA:
return quota_setquota(sb, type, id, addr);
case Q_SYNC:
@@ -690,6 +748,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
return quota_setxquota(sb, type, id, addr);
case Q_XGETQUOTA:
return quota_getxquota(sb, type, id, addr);
+ case Q_XGETNEXTQUOTA:
+ return quota_getnextxquota(sb, type, id, addr);
case Q_XQUOTASYNC:
if (sb->s_flags & MS_RDONLY)
return -EROFS;
@@ -705,6 +765,11 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
/* Return 1 if 'cmd' will block on frozen filesystem */
static int quotactl_cmd_write(int cmd)
{
+ /*
+ * We cannot allow Q_GETQUOTA and Q_GETNEXTQUOTA without write access
+ * as dquot_acquire() may allocate space for new structure and OCFS2
+ * needs to increment on-disk use count.
+ */
switch (cmd) {
case Q_GETFMT:
case Q_GETINFO:
@@ -712,6 +777,7 @@ static int quotactl_cmd_write(int cmd)
case Q_XGETQSTAT:
case Q_XGETQSTATV:
case Q_XGETQUOTA:
+ case Q_XGETNEXTQUOTA:
case Q_XQUOTASYNC:
return 0;
}
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 58efb83dec1c..0738972e8d3f 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,10 +22,9 @@ MODULE_LICENSE("GPL");
#define __QUOTA_QT_PARANOIA
-static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
+static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
{
unsigned int epb = info->dqi_usable_bs >> 2;
- qid_t id = from_kqid(&init_user_ns, qid);
depth = info->dqi_qtree_depth - depth - 1;
while (depth--)
@@ -33,6 +32,13 @@ static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
return id % epb;
}
+/* kqid front end for __get_index(): translate @qid via init_user_ns first. */
+static int get_index(struct qtree_mem_dqinfo *info, struct kqid qid, int depth)
+{
+	return __get_index(info, from_kqid(&init_user_ns, qid), depth);
+}
+
/* Number of entries in one blocks */
static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
{
@@ -668,3 +674,60 @@ int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
return 0;
}
EXPORT_SYMBOL(qtree_release_dquot);
+
+/*
+ * Search tree block @blk, located at level @depth of the quota tree, for the
+ * first used reference that belongs to an id >= *@id, descending into child
+ * blocks as needed.
+ *
+ * *@id is advanced past every empty slot that is skipped, so on success it
+ * holds the id found.
+ *
+ * Returns 0 when an id was found, -ENOENT when nothing at or after *@id
+ * exists below @blk, -ENOMEM when the block buffer cannot be allocated, or
+ * the negative error from read_blk().
+ */
+static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
+			unsigned int blk, int depth)
+{
+	char *buf = getdqbuf(info->dqi_usable_bs);
+	__le32 *ref = (__le32 *)buf;
+	ssize_t ret;
+	unsigned int epb = info->dqi_usable_bs >> 2;
+	unsigned int level_inc = 1;
+	int i;
+
+	if (!buf)
+		return -ENOMEM;
+
+	/* number of ids covered by a single reference at this level */
+	for (i = depth; i < info->dqi_qtree_depth - 1; i++)
+		level_inc *= epb;
+
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		quota_error(info->dqi_sb,
+			    "Can't read quota tree block %u", blk);
+		goto out_buf;
+	}
+	for (i = __get_index(info, *id, depth); i < epb; i++) {
+		if (ref[i] == cpu_to_le32(0)) {
+			/*
+			 * Nothing below this slot: continue from the first id
+			 * of the next slot. Round down before advancing, as
+			 * *id may still carry the lower-level indices of the
+			 * id the search started from; keeping them would make
+			 * the next subtree be searched from the middle and
+			 * smaller ids in it would be missed.
+			 */
+			*id = (*id / level_inc + 1) * level_inc;
+			continue;
+		}
+		if (depth == info->dqi_qtree_depth - 1) {
+			ret = 0;
+			goto out_buf;
+		}
+		ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1);
+		if (ret != -ENOENT)
+			break;
+	}
+	if (i == epb) {
+		ret = -ENOENT;
+		goto out_buf;
+	}
+out_buf:
+	kfree(buf);
+	return ret;
+}
+
+/*
+ * Replace *@qid with the next id >= *@qid found in the quota tree, starting
+ * the search at the root block (QT_TREEOFF). Returns 0 on success; on a
+ * negative error from find_next_id() *@qid is left untouched.
+ */
+int qtree_get_next_id(struct qtree_mem_dqinfo *info, struct kqid *qid)
+{
+	qid_t next = from_kqid(&init_user_ns, *qid);
+	int err = find_next_id(info, &next, QT_TREEOFF, 0);
+
+	if (err >= 0)
+		*qid = make_kqid(&init_user_ns, qid->type, next);
+
+	return err < 0 ? err : 0;
+}
+EXPORT_SYMBOL(qtree_get_next_id);
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 2aa012a68e90..ca71bf881ad1 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
static int v2r1_is_id(void *dp, struct dquot *dquot);
-static struct qtree_fmt_operations v2r0_qtree_ops = {
+static const struct qtree_fmt_operations v2r0_qtree_ops = {
.mem2disk_dqblk = v2r0_mem2diskdqb,
.disk2mem_dqblk = v2r0_disk2memdqb,
.is_id = v2r0_is_id,
};
-static struct qtree_fmt_operations v2r1_qtree_ops = {
+static const struct qtree_fmt_operations v2r1_qtree_ops = {
.mem2disk_dqblk = v2r1_mem2diskdqb,
.disk2mem_dqblk = v2r1_disk2memdqb,
.is_id = v2r1_is_id,
@@ -304,6 +304,11 @@ static int v2_free_file_info(struct super_block *sb, int type)
return 0;
}
+/* ->get_next_id for the v2 format: delegate to the generic quota tree code. */
+static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
+{
+	void *tree_info = sb_dqinfo(sb, qid->type)->dqi_priv;
+
+	return qtree_get_next_id(tree_info, qid);
+}
+
static const struct quota_format_ops v2_format_ops = {
.check_quota_file = v2_check_quota_file,
.read_file_info = v2_read_file_info,
@@ -312,6 +317,7 @@ static const struct quota_format_ops v2_format_ops = {
.read_dqblk = v2_read_dquot,
.commit_dqblk = v2_write_dquot,
.release_dqblk = v2_release_dquot,
+ .get_next_id = v2_get_next_id,
};
static struct quota_format_type v2r0_quota_format = {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 889d558b4e05..1ab6e6c2e60e 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -79,6 +79,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
break;
}
}
@@ -222,8 +223,8 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent)
return err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;
diff --git a/fs/read_write.c b/fs/read_write.c
index 819ef3faf1bb..cf377cf9dfe3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,8 @@
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -171,6 +173,45 @@ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t si
EXPORT_SYMBOL(fixed_size_llseek);
/**
+ * no_seek_end_llseek - llseek implementation that only allows SEEK_SET and SEEK_CUR
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ *
+ * SEEK_SET and SEEK_CUR are passed to generic_file_llseek_size() with a
+ * maximum offset of OFFSET_MAX; every other @whence value fails with -EINVAL.
+ */
+loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_SET: case SEEK_CUR:
+ return generic_file_llseek_size(file, offset, whence,
+ OFFSET_MAX, 0);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(no_seek_end_llseek);
+
+/**
+ * no_seek_end_llseek_size - SEEK_SET/SEEK_CUR-only llseek with a caller-supplied offset limit
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @whence: type of seek
+ * @size: maximal offset allowed
+ *
+ * SEEK_SET and SEEK_CUR are passed to generic_file_llseek_size() with @size
+ * as the maximum offset; every other @whence value fails with -EINVAL.
+ */
+loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
+{
+ switch (whence) {
+ case SEEK_SET: case SEEK_CUR:
+ return generic_file_llseek_size(file, offset, whence,
+ size, 0);
+ default:
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(no_seek_end_llseek_size);
+
+/**
* noop_llseek - No Operation Performed llseek implementation
* @file: file structure to seek on
* @offset: file offset to seek to
@@ -198,7 +239,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file_inode(file);
loff_t retval;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
switch (whence) {
case SEEK_END:
offset += i_size_read(inode);
@@ -243,7 +284,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
retval = offset;
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return retval;
}
EXPORT_SYMBOL(default_llseek);
@@ -395,9 +436,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
}
if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
- retval = locks_mandatory_area(
- read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
- inode, file, pos, count);
+ retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
+ read_write == READ ? F_RDLCK : F_WRLCK);
if (retval < 0)
return retval;
}
@@ -653,12 +693,17 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
EXPORT_SYMBOL(iov_shorten);
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, iter_fn_t fn)
+ loff_t *ppos, iter_fn_t fn, int flags)
{
struct kiocb kiocb;
ssize_t ret;
+ if (flags & ~RWF_HIPRI)
+ return -EOPNOTSUPP;
+
init_sync_kiocb(&kiocb, filp);
+ if (flags & RWF_HIPRI)
+ kiocb.ki_flags |= IOCB_HIPRI;
kiocb.ki_pos = *ppos;
ret = fn(&kiocb, iter);
@@ -669,10 +714,13 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
- loff_t *ppos, io_fn_t fn)
+ loff_t *ppos, io_fn_t fn, int flags)
{
ssize_t ret = 0;
+ if (flags & ~RWF_HIPRI)
+ return -EOPNOTSUPP;
+
while (iov_iter_count(iter)) {
struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr;
@@ -773,7 +821,8 @@ out:
static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
- unsigned long nr_segs, loff_t *pos)
+ unsigned long nr_segs, loff_t *pos,
+ int flags)
{
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
@@ -805,9 +854,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
}
if (iter_fn)
- ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
else
- ret = do_loop_readv_writev(file, &iter, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
if (type != READ)
file_end_write(file);
@@ -824,40 +873,40 @@ out:
}
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
- return do_readv_writev(READ, file, vec, vlen, pos);
+ return do_readv_writev(READ, file, vec, vlen, pos, flags);
}
EXPORT_SYMBOL(vfs_readv);
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
- return do_readv_writev(WRITE, file, vec, vlen, pos);
+ return do_readv_writev(WRITE, file, vec, vlen, pos, flags);
}
EXPORT_SYMBOL(vfs_writev);
-SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen)
+static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_readv(f.file, vec, vlen, &pos);
+ ret = vfs_readv(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -869,15 +918,15 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
-SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen)
+static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos = file_pos_read(f.file);
- ret = vfs_writev(f.file, vec, vlen, &pos);
+ ret = vfs_writev(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
@@ -895,10 +944,9 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}
-SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos, int flags)
{
- loff_t pos = pos_from_hilo(pos_h, pos_l);
struct fd f;
ssize_t ret = -EBADF;
@@ -909,7 +957,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
- ret = vfs_readv(f.file, vec, vlen, &pos);
+ ret = vfs_readv(f.file, vec, vlen, &pos, flags);
fdput(f);
}
@@ -919,10 +967,9 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
-SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
- unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
+ unsigned long vlen, loff_t pos, int flags)
{
- loff_t pos = pos_from_hilo(pos_h, pos_l);
struct fd f;
ssize_t ret = -EBADF;
@@ -933,7 +980,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PWRITE)
- ret = vfs_writev(f.file, vec, vlen, &pos);
+ ret = vfs_writev(f.file, vec, vlen, &pos, flags);
fdput(f);
}
@@ -943,11 +990,64 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
return ret;
}
+/* readv: vectored read through do_readv() with no per-call flags. */
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen)
+{
+ return do_readv(fd, vec, vlen, 0);
+}
+
+/* writev: vectored write through do_writev() with no per-call flags. */
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen)
+{
+ return do_writev(fd, vec, vlen, 0);
+}
+
+/*
+ * preadv: vectored read at an explicit offset, passed as two halves
+ * (@pos_l, @pos_h) and reassembled by pos_from_hilo(); no per-call flags.
+ */
+SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ return do_preadv(fd, vec, vlen, pos, 0);
+}
+
+/*
+ * preadv2: like preadv, with an additional @flags argument that is passed
+ * down to the read path. An offset of -1 selects the readv behaviour
+ * (do_readv()) instead of a positioned read.
+ */
+SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+ int, flags)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ if (pos == -1)
+ return do_readv(fd, vec, vlen, flags);
+
+ return do_preadv(fd, vec, vlen, pos, flags);
+}
+
+/*
+ * pwritev: vectored write at an explicit offset, passed as two halves
+ * (@pos_l, @pos_h) and reassembled by pos_from_hilo(); no per-call flags.
+ */
+SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ return do_pwritev(fd, vec, vlen, pos, 0);
+}
+
+/*
+ * pwritev2: like pwritev, with an additional @flags argument that is passed
+ * down to the write path. An offset of -1 selects the writev behaviour
+ * (do_writev()) instead of a positioned write.
+ */
+SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
+ unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
+ int, flags)
+{
+ loff_t pos = pos_from_hilo(pos_h, pos_l);
+
+ if (pos == -1)
+ return do_writev(fd, vec, vlen, flags);
+
+ return do_pwritev(fd, vec, vlen, pos, flags);
+}
+
#ifdef CONFIG_COMPAT
static ssize_t compat_do_readv_writev(int type, struct file *file,
const struct compat_iovec __user *uvector,
- unsigned long nr_segs, loff_t *pos)
+ unsigned long nr_segs, loff_t *pos,
+ int flags)
{
compat_ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
@@ -979,9 +1079,9 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
}
if (iter_fn)
- ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
else
- ret = do_loop_readv_writev(file, &iter, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
if (type != READ)
file_end_write(file);
@@ -999,7 +1099,7 @@ out:
static size_t compat_readv(struct file *file,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
ssize_t ret = -EBADF;
@@ -1010,7 +1110,7 @@ static size_t compat_readv(struct file *file,
if (!(file->f_mode & FMODE_CAN_READ))
goto out;
- ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
+ ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
out:
if (ret > 0)
@@ -1019,9 +1119,9 @@ out:
return ret;
}
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
- compat_ulong_t, vlen)
+static size_t do_compat_readv(compat_ulong_t fd,
+ const struct compat_iovec __user *vec,
+ compat_ulong_t vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret;
@@ -1030,16 +1130,24 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
if (!f.file)
return -EBADF;
pos = f.file->f_pos;
- ret = compat_readv(f.file, vec, vlen, &pos);
+ ret = compat_readv(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
f.file->f_pos = pos;
fdput_pos(f);
return ret;
+
}
-static long __compat_sys_preadv64(unsigned long fd,
+/* compat readv: vectored read through do_compat_readv() with no flags. */
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
+ const struct compat_iovec __user *,vec,
+ compat_ulong_t, vlen)
+{
+ return do_compat_readv(fd, vec, vlen, 0);
+}
+
+static long do_compat_preadv64(unsigned long fd,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos)
+ unsigned long vlen, loff_t pos, int flags)
{
struct fd f;
ssize_t ret;
@@ -1051,7 +1159,7 @@ static long __compat_sys_preadv64(unsigned long fd,
return -EBADF;
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
- ret = compat_readv(f.file, vec, vlen, &pos);
+ ret = compat_readv(f.file, vec, vlen, &pos, flags);
fdput(f);
return ret;
}
@@ -1061,7 +1169,7 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos)
{
- return __compat_sys_preadv64(fd, vec, vlen, pos);
+ return do_compat_preadv64(fd, vec, vlen, pos, 0);
}
#endif
@@ -1071,12 +1179,25 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return __compat_sys_preadv64(fd, vec, vlen, pos);
+ return do_compat_preadv64(fd, vec, vlen, pos, 0);
+}
+
+/*
+ * compat preadv2: the 64-bit offset arrives as two 32-bit halves. An offset
+ * of -1 selects the readv behaviour (do_compat_readv()); @flags is passed
+ * down in both cases.
+ */
+COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
+ const struct compat_iovec __user *,vec,
+ compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
+ int, flags)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+ if (pos == -1)
+ return do_compat_readv(fd, vec, vlen, flags);
+
+ return do_compat_preadv64(fd, vec, vlen, pos, flags);
}
static size_t compat_writev(struct file *file,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos)
+ unsigned long vlen, loff_t *pos, int flags)
{
ssize_t ret = -EBADF;
@@ -1087,7 +1208,7 @@ static size_t compat_writev(struct file *file,
if (!(file->f_mode & FMODE_CAN_WRITE))
goto out;
- ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
+ ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags);
out:
if (ret > 0)
@@ -1096,9 +1217,9 @@ out:
return ret;
}
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
- const struct compat_iovec __user *, vec,
- compat_ulong_t, vlen)
+static size_t do_compat_writev(compat_ulong_t fd,
+ const struct compat_iovec __user* vec,
+ compat_ulong_t vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret;
@@ -1107,16 +1228,23 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
if (!f.file)
return -EBADF;
pos = f.file->f_pos;
- ret = compat_writev(f.file, vec, vlen, &pos);
+ ret = compat_writev(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
f.file->f_pos = pos;
fdput_pos(f);
return ret;
}
-static long __compat_sys_pwritev64(unsigned long fd,
+/* compat writev: vectored write through do_compat_writev() with no flags. */
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
+ const struct compat_iovec __user *, vec,
+ compat_ulong_t, vlen)
+{
+ return do_compat_writev(fd, vec, vlen, 0);
+}
+
+static long do_compat_pwritev64(unsigned long fd,
const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos)
+ unsigned long vlen, loff_t pos, int flags)
{
struct fd f;
ssize_t ret;
@@ -1128,7 +1256,7 @@ static long __compat_sys_pwritev64(unsigned long fd,
return -EBADF;
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PWRITE)
- ret = compat_writev(f.file, vec, vlen, &pos);
+ ret = compat_writev(f.file, vec, vlen, &pos, flags);
fdput(f);
return ret;
}
@@ -1138,7 +1266,7 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
const struct compat_iovec __user *,vec,
unsigned long, vlen, loff_t, pos)
{
- return __compat_sys_pwritev64(fd, vec, vlen, pos);
+ return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
#endif
@@ -1148,8 +1276,21 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return __compat_sys_pwritev64(fd, vec, vlen, pos);
+ return do_compat_pwritev64(fd, vec, vlen, pos, 0);
}
+
+/*
+ * compat pwritev2: the 64-bit offset arrives as two 32-bit halves. An offset
+ * of -1 selects the writev behaviour (do_compat_writev()); @flags is passed
+ * down in both cases.
+ */
+COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
+ const struct compat_iovec __user *,vec,
+ compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags)
+{
+ loff_t pos = ((loff_t)pos_high << 32) | pos_low;
+
+ if (pos == -1)
+ return do_compat_writev(fd, vec, vlen, flags);
+
+ return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+}
+
#endif
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
@@ -1327,3 +1468,304 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
return do_sendfile(out_fd, in_fd, NULL, count, 0);
}
#endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows returning partial success. When it does so is up to
+ * the copy_file_range method.
+ *
+ * Copies up to @len bytes from @file_in at @pos_in to @file_out at @pos_out.
+ * @flags must be 0. Returns the number of bytes copied, or a negative errno:
+ * -EINVAL for non-zero @flags, -EBADF when @file_in is not open for reading
+ * or @file_out is not open for writing or is O_APPEND, -EXDEV when the two
+ * files are on different superblocks, or an error from rw_verify_area(),
+ * mnt_want_write_file() or the copy itself.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ ssize_t ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (ret >= 0)
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (ret < 0)
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
+
+ /* this could be relaxed once a method supports cross-fs copies */
+ if (inode_in->i_sb != inode_out->i_sb)
+ return -EXDEV;
+
+ if (len == 0)
+ return 0;
+
+ /* paired with mnt_drop_write_file() below */
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ /*
+ * Prefer the filesystem's ->copy_file_range(); when it is missing or
+ * answers -EOPNOTSUPP, fall back to do_splice_direct(), capped at
+ * MAX_RW_COUNT bytes.
+ */
+ ret = -EOPNOTSUPP;
+ if (file_out->f_op->copy_file_range)
+ ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+ pos_out, len, flags);
+ if (ret == -EOPNOTSUPP)
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+ /* notifications and byte accounting only when something was copied */
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+ /* the syscall counters are bumped whether or not the copy succeeded */
+ inc_syscr(current);
+ inc_syscw(current);
+
+ mnt_drop_write_file(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+/*
+ * copy_file_range syscall: resolve both descriptors, pick the start offsets
+ * and call vfs_copy_file_range().
+ *
+ * For each side, a non-NULL @off_in / @off_out supplies the offset and
+ * receives the advanced offset afterwards; a NULL pointer means the file's
+ * own f_pos is used and advanced instead. Returns the number of bytes copied
+ * or a negative errno (-EBADF for an invalid descriptor, -EFAULT when an
+ * offset cannot be copied from or to userspace).
+ */
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ ssize_t ret = -EBADF;
+
+ f_in = fdget(fd_in);
+ if (!f_in.file)
+ goto out2;
+
+ f_out = fdget(fd_out);
+ if (!f_out.file)
+ goto out1;
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ /* offsets only move when at least one byte was copied */
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ /* the copy has already happened; only the result becomes -EFAULT */
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+
+ /* labels release the descriptors in reverse order of acquisition */
+out:
+ fdput(f_out);
+out1:
+ fdput(f_in);
+out2:
+ return ret;
+}
+
+/*
+ * Validate the byte range [@pos, @pos + @len) of @file for a clone/dedupe
+ * operation: reject negative or overflowing ranges with -EINVAL, honour
+ * mandatory locks on the inode, then return the verdict of
+ * security_file_permission(). A @len of 0 makes the mandatory-lock check
+ * extend to OFFSET_MAX. @write selects write vs. read access.
+ */
+static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+{
+	struct inode *ino = file_inode(file);
+	int err;
+
+	if (unlikely(pos < 0) || unlikely((loff_t) (pos + len) < 0))
+		return -EINVAL;
+
+	if (unlikely(ino->i_flctx && mandatory_lock(ino))) {
+		loff_t last;
+
+		if (len)
+			last = pos + len - 1;
+		else
+			last = OFFSET_MAX;
+
+		err = locks_mandatory_area(ino, file, pos, last,
+					   write ? F_WRLCK : F_RDLCK);
+		if (err < 0)
+			return err;
+	}
+
+	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+}
+
+/*
+ * Clone @len bytes from @file_in at @pos_in into @file_out at @pos_out using
+ * the filesystem's ->clone_file_range() method.
+ *
+ * Returns the method's result, or a negative errno from the checks below:
+ * -EXDEV when the files are on different superblocks or mounts, -EISDIR /
+ * -EINVAL when either is a directory / not a regular file, -EBADF for wrong
+ * open modes or an O_APPEND destination, -EOPNOTSUPP when the method is
+ * missing, -EINVAL when the source range extends past the source file size.
+ */
+int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ int ret;
+
+ if (inode_in->i_sb != inode_out->i_sb ||
+ file_in->f_path.mnt != file_out->f_path.mnt)
+ return -EXDEV;
+
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
+
+ if (!file_in->f_op->clone_file_range)
+ return -EOPNOTSUPP;
+
+ /* range, mandatory-lock and security checks: read side, then write side */
+ ret = clone_verify_area(file_in, pos_in, len, false);
+ if (ret)
+ return ret;
+
+ ret = clone_verify_area(file_out, pos_out, len, true);
+ if (ret)
+ return ret;
+
+ if (pos_in + len > i_size_read(inode_in))
+ return -EINVAL;
+
+ /* paired with mnt_drop_write_file() below */
+ ret = mnt_want_write_file(file_out);
+ if (ret)
+ return ret;
+
+ ret = file_in->f_op->clone_file_range(file_in, pos_in,
+ file_out, pos_out, len);
+ /* notify only when the method reported success (0) */
+ if (!ret) {
+ fsnotify_access(file_in);
+ fsnotify_modify(file_out);
+ }
+
+ mnt_drop_write_file(file_out);
+ return ret;
+}
+EXPORT_SYMBOL(vfs_clone_file_range);
+
+/*
+ * Deduplicate the source range described by @same (src_offset, src_length)
+ * of @file against each of the same->dest_count destinations in same->info[].
+ *
+ * Per-destination results are reported in info[i].status (a negative errno,
+ * FILE_DEDUPE_RANGE_SAME or FILE_DEDUPE_RANGE_DIFFERS) and
+ * info[i].bytes_deduped. Source-side failures are returned directly:
+ * -EINVAL when @file is not open for reading, a reserved field is set or
+ * the source is not a regular file, -EISDIR for a directory, or the error
+ * from clone_verify_area().
+ *
+ * NOTE(review): a failure of mnt_want_write_file() or clone_verify_area()
+ * for a destination is also left in 'ret', so it becomes the function's
+ * return value if no later iteration overwrites it -- confirm whether that
+ * is intended or whether only info->status should carry it.
+ */
+int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+{
+ struct file_dedupe_range_info *info;
+ struct inode *src = file_inode(file);
+ u64 off;
+ u64 len;
+ int i;
+ int ret;
+ bool is_admin = capable(CAP_SYS_ADMIN);
+ u16 count = same->dest_count;
+ struct file *dst_file;
+ loff_t dst_off;
+ ssize_t deduped;
+
+ if (!(file->f_mode & FMODE_READ))
+ return -EINVAL;
+
+ if (same->reserved1 || same->reserved2)
+ return -EINVAL;
+
+ off = same->src_offset;
+ len = same->src_length;
+
+ ret = -EISDIR;
+ if (S_ISDIR(src->i_mode))
+ goto out;
+
+ ret = -EINVAL;
+ if (!S_ISREG(src->i_mode))
+ goto out;
+
+ ret = clone_verify_area(file, off, len, false);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+ /* pre-format output fields to sane values */
+ for (i = 0; i < count; i++) {
+ same->info[i].bytes_deduped = 0ULL;
+ same->info[i].status = FILE_DEDUPE_RANGE_SAME;
+ }
+
+ for (i = 0, info = same->info; i < count; i++, info++) {
+ struct inode *dst;
+ struct fd dst_fd = fdget(info->dest_fd);
+
+ dst_file = dst_fd.file;
+ if (!dst_file) {
+ info->status = -EBADF;
+ goto next_loop;
+ }
+ dst = file_inode(dst_file);
+
+ /* paired with mnt_drop_write_file() at next_file */
+ ret = mnt_want_write_file(dst_file);
+ if (ret) {
+ info->status = ret;
+ goto next_loop;
+ }
+
+ dst_off = info->dest_offset;
+ ret = clone_verify_area(dst_file, dst_off, len, true);
+ if (ret < 0) {
+ info->status = ret;
+ goto next_file;
+ }
+ ret = 0;
+
+ /* per-destination checks; the first one that fails sets the status */
+ if (info->reserved) {
+ info->status = -EINVAL;
+ } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+ info->status = -EINVAL;
+ } else if (file->f_path.mnt != dst_file->f_path.mnt) {
+ info->status = -EXDEV;
+ } else if (S_ISDIR(dst->i_mode)) {
+ info->status = -EISDIR;
+ } else if (dst_file->f_op->dedupe_file_range == NULL) {
+ info->status = -EINVAL;
+ } else {
+ deduped = dst_file->f_op->dedupe_file_range(file, off,
+ len, dst_file,
+ info->dest_offset);
+ /* -EBADE from the method is reported as "contents differ" */
+ if (deduped == -EBADE)
+ info->status = FILE_DEDUPE_RANGE_DIFFERS;
+ else if (deduped < 0)
+ info->status = deduped;
+ else
+ info->bytes_deduped += deduped;
+ }
+
+next_file:
+ mnt_drop_write_file(dst_file);
+next_loop:
+ fdput(dst_fd);
+
+ /* stop processing further destinations once a fatal signal is pending */
+ if (fatal_signal_pending(current))
+ goto out;
+ }
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/readdir.c b/fs/readdir.c
index ced679179cac..e69ef3b79787 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
fsnotify_access(file);
file_accessed(file);
}
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
return res;
}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 4a024e2ceb9f..3abd4004184b 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
reiserfs_write_lock(inode->i_sb);
err = reiserfs_commit_for_inode(inode);
reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (err < 0)
return err;
return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 96a1bcf33db4..389773711de4 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
BUG_ON(!S_ISREG(inode->i_mode));
err = sync_mapping_buffers(inode->i_mapping);
reiserfs_write_lock(inode->i_sb);
@@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
reiserfs_write_unlock(inode->i_sb);
if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (barrier_done < 0)
return barrier_done;
return (err < 0) ? -EIO : 0;
@@ -180,11 +180,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
int partial = 0;
unsigned blocksize;
struct buffer_head *bh, *head;
- unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
int new;
int logit = reiserfs_file_data_log(inode);
struct super_block *s = inode->i_sb;
- int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+ int bh_per_page = PAGE_SIZE / s->s_blocksize;
struct reiserfs_transaction_handle th;
int ret = 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 3d8e7e671d5b..d5c2e9c865de 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -386,7 +386,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
goto finished;
}
/* read file tail into part of page */
- offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
+ offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
copy_item_head(&tmp_ih, ih);
/*
@@ -587,10 +587,10 @@ static int convert_tail_for_hole(struct inode *inode,
return -EIO;
/* always try to read until the end of the block */
- tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
+ tail_start = tail_offset & (PAGE_SIZE - 1);
tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
- index = tail_offset >> PAGE_CACHE_SHIFT;
+ index = tail_offset >> PAGE_SHIFT;
/*
* hole_page can be zero in case of direct_io, we are sure
* that we cannot get here if we write with O_DIRECT into tail page
@@ -629,7 +629,7 @@ static int convert_tail_for_hole(struct inode *inode,
unlock:
if (tail_page != hole_page) {
unlock_page(tail_page);
- page_cache_release(tail_page);
+ put_page(tail_page);
}
out:
return retval;
@@ -1361,6 +1361,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode->i_fop = &reiserfs_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
} else {
inode->i_blocks = 0;
@@ -2188,11 +2189,11 @@ static int grab_tail_page(struct inode *inode,
* we want the page with the last byte in the file,
* not the page that will hold the next byte for appending
*/
- unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
unsigned long pos = 0;
unsigned long start = 0;
unsigned long blocksize = inode->i_sb->s_blocksize;
- unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
+ unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
struct buffer_head *bh;
struct buffer_head *head;
struct page *page;
@@ -2250,7 +2251,7 @@ out:
unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return error;
}
@@ -2264,7 +2265,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
{
struct reiserfs_transaction_handle th;
/* we want the offset for the first byte after the end of the file */
- unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned length;
struct page *page = NULL;
@@ -2344,7 +2345,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
}
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
reiserfs_write_unlock(inode->i_sb);
@@ -2353,7 +2354,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
out:
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
reiserfs_write_unlock(inode->i_sb);
@@ -2425,7 +2426,7 @@ research:
} else if (is_direct_le_ih(ih)) {
char *p;
p = page_address(bh_result->b_page);
- p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
+ p += (byte_offset - 1) & (PAGE_SIZE - 1);
copy_size = ih_item_len(ih) - pos_in_item;
fs_gen = get_generation(inode->i_sb);
@@ -2524,7 +2525,7 @@ static int reiserfs_write_full_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = inode->i_size >> PAGE_SHIFT;
int error = 0;
unsigned long block;
sector_t last_block;
@@ -2534,7 +2535,7 @@ static int reiserfs_write_full_page(struct page *page,
int checked = PageChecked(page);
struct reiserfs_transaction_handle th;
struct super_block *s = inode->i_sb;
- int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+ int bh_per_page = PAGE_SIZE / s->s_blocksize;
th.t_trans_id = 0;
/* no logging allowed when nonblocking or from PF_MEMALLOC */
@@ -2563,16 +2564,16 @@ static int reiserfs_write_full_page(struct page *page,
if (page->index >= end_index) {
unsigned last_offset;
- last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+ last_offset = inode->i_size & (PAGE_SIZE - 1);
/* no file contents in this page */
if (page->index >= end_index + 1 || !last_offset) {
unlock_page(page);
return 0;
}
- zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
+ zero_user_segment(page, last_offset, PAGE_SIZE);
}
bh = head;
- block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
+ block = page->index << (PAGE_SHIFT - s->s_blocksize_bits);
last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
/* first map all the buffers, logging any direct items we find */
do {
@@ -2773,7 +2774,7 @@ static int reiserfs_write_begin(struct file *file,
*fsdata = (void *)(unsigned long)flags;
}
- index = pos >> PAGE_CACHE_SHIFT;
+ index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -2821,7 +2822,7 @@ static int reiserfs_write_begin(struct file *file,
}
if (ret) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
/* Truncate allocated blocks */
reiserfs_truncate_failed_write(inode);
}
@@ -2908,7 +2909,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
else
th = NULL;
- start = pos & (PAGE_CACHE_SIZE - 1);
+ start = pos & (PAGE_SIZE - 1);
if (unlikely(copied < len)) {
if (!PageUptodate(page))
copied = 0;
@@ -2973,7 +2974,7 @@ out:
if (locked)
reiserfs_write_unlock(inode->i_sb);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (pos + len > inode->i_size)
reiserfs_truncate_failed_write(inode);
@@ -2995,7 +2996,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
- loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
+ loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
int ret = 0;
int update_sd = 0;
struct reiserfs_transaction_handle *th = NULL;
@@ -3180,7 +3181,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
struct inode *inode = page->mapping->host;
unsigned int curr_off = 0;
unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_CACHE_SIZE);
+ int partial_page = (offset || length < PAGE_SIZE);
int ret = 1;
BUG_ON(!PageLocked(page));
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 6ec8a30a0911..57045f423893 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -203,7 +203,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
* __reiserfs_write_begin on that page. This will force a
* reiserfs_get_block to unpack the tail for us.
*/
- index = inode->i_size >> PAGE_CACHE_SHIFT;
+ index = inode->i_size >> PAGE_SHIFT;
mapping = inode->i_mapping;
page = grab_cache_page(mapping, index);
retval = -ENOMEM;
@@ -221,10 +221,10 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
out_unlock:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
reiserfs_write_unlock(inode->i_sb);
return retval;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9d6486d416a3..2ace90e981f0 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -599,18 +599,18 @@ static int journal_list_still_alive(struct super_block *s,
* This does a check to see if the buffer belongs to one of these
* lost pages before doing the final put_bh. If page->mapping was
* null, it tries to free buffers on the page, which should make the
- * final page_cache_release drop the page from the lru.
+ * final put_page drop the page from the lru.
*/
static void release_buffer_page(struct buffer_head *bh)
{
struct page *page = bh->b_page;
if (!page->mapping && trylock_page(page)) {
- page_cache_get(page);
+ get_page(page);
put_bh(bh);
if (!page->mapping)
try_to_free_buffers(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
} else {
put_bh(bh);
}
@@ -618,12 +618,10 @@ static void release_buffer_page(struct buffer_head *bh)
static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
- char b[BDEVNAME_SIZE];
-
if (buffer_journaled(bh)) {
reiserfs_warning(NULL, "clm-2084",
- "pinned buffer %lu:%s sent to disk",
- bh->b_blocknr, bdevname(bh->b_bdev, b));
+ "pinned buffer %lu:%pg sent to disk",
+ bh->b_blocknr, bh->b_bdev);
}
if (uptodate)
set_buffer_uptodate(bh);
@@ -2387,11 +2385,10 @@ static int journal_read(struct super_block *sb)
int replay_count = 0;
int continue_replay = 1;
int ret;
- char b[BDEVNAME_SIZE];
cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- reiserfs_info(sb, "checking transaction log (%s)\n",
- bdevname(journal->j_dev_bd, b));
+ reiserfs_info(sb, "checking transaction log (%pg)\n",
+ journal->j_dev_bd);
start = get_seconds();
/*
@@ -2651,8 +2648,8 @@ static int journal_init_dev(struct super_block *super,
set_blocksize(journal->j_dev_bd, super->s_blocksize);
reiserfs_info(super,
- "journal_init_dev: journal device: %s\n",
- bdevname(journal->j_dev_bd, b));
+ "journal_init_dev: journal device: %pg\n",
+ journal->j_dev_bd);
return 0;
}
@@ -2724,7 +2721,6 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
struct reiserfs_journal_header *jh;
struct reiserfs_journal *journal;
struct reiserfs_journal_list *jl;
- char b[BDEVNAME_SIZE];
int ret;
journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
@@ -2794,10 +2790,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
&& (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
sb_jp_journal_magic(rs))) {
reiserfs_warning(sb, "sh-460",
- "journal header magic %x (device %s) does "
+ "journal header magic %x (device %pg) does "
"not match to magic found in super block %x",
jh->jh_journal.jp_journal_magic,
- bdevname(journal->j_dev_bd, b),
+ journal->j_dev_bd,
sb_jp_journal_magic(rs));
brelse(bhjh);
goto free_and_return;
@@ -2818,10 +2814,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
journal->j_max_trans_age = commit_max_age;
}
- reiserfs_info(sb, "journal params: device %s, size %u, "
+ reiserfs_info(sb, "journal params: device %pg, size %u, "
"journal first block %u, max trans len %u, max batch %u, "
"max commit age %u, max trans age %u\n",
- bdevname(journal->j_dev_bd, b),
+ journal->j_dev_bd,
SB_ONDISK_JOURNAL_SIZE(sb),
SB_ONDISK_JOURNAL_1st_BLOCK(sb),
journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 5f1c9c29eb8c..2a12d46d7fb4 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -712,9 +712,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
retval = dquot_initialize(dir);
if (retval)
return retval;
@@ -1173,6 +1170,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
reiserfs_update_inode_transaction(parent_dir);
inode->i_op = &reiserfs_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
@@ -1667,8 +1665,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
*/
const struct inode_operations reiserfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.setattr = reiserfs_setattr,
.setxattr = reiserfs_setxattr,
.getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index ae1dc841db3a..4f3f928076f3 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -139,11 +139,9 @@ static void sprintf_block_head(char *buf, struct buffer_head *bh)
static void sprintf_buffer_head(char *buf, struct buffer_head *bh)
{
- char b[BDEVNAME_SIZE];
-
sprintf(buf,
- "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
- bdevname(bh->b_bdev, b), bh->b_size,
+ "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+ bh->b_bdev, bh->b_size,
(unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)),
bh->b_state, bh->b_page,
buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
@@ -530,7 +528,6 @@ static int print_super_block(struct buffer_head *bh)
(struct reiserfs_super_block *)(bh->b_data);
int skipped, data_blocks;
char *version;
- char b[BDEVNAME_SIZE];
if (is_reiserfs_3_5(rs)) {
version = "3.5";
@@ -543,7 +540,7 @@ static int print_super_block(struct buffer_head *bh)
return 1;
}
- printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b),
+ printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
(unsigned long long)bh->b_blocknr);
printk("Reiserfs version %s\n", version);
printk("Block count %u\n", sb_block_count(rs));
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 621b9f381fe1..fe999157dd97 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -303,11 +303,10 @@ static int show_journal(struct seq_file *m, void *unused)
struct reiserfs_sb_info *r = REISERFS_SB(sb);
struct reiserfs_super_block *rs = r->s_rs;
struct journal_params *jp = &rs->s_v1.s_journal;
- char b[BDEVNAME_SIZE];
seq_printf(m, /* on-disk fields */
"jp_journal_1st_block: \t%i\n"
- "jp_journal_dev: \t%s[%x]\n"
+ "jp_journal_dev: \t%pg[%x]\n"
"jp_journal_size: \t%i\n"
"jp_journal_trans_max: \t%i\n"
"jp_journal_magic: \t%i\n"
@@ -348,7 +347,7 @@ static int show_journal(struct seq_file *m, void *unused)
"prepare: \t%12lu\n"
"prepare_retry: \t%12lu\n",
DJP(jp_journal_1st_block),
- bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
+ SB_JOURNAL(sb)->j_dev_bd,
DJP(jp_journal_dev),
DJP(jp_journal_size),
DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 24cbe013240f..5feacd689241 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1342,7 +1342,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
*/
data = kmap_atomic(un_bh->b_page);
- off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
+ off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
memcpy(data + off,
ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
ret_value);
@@ -1511,7 +1511,7 @@ static void unmap_buffers(struct page *page, loff_t pos)
if (page) {
if (page_has_buffers(page)) {
- tail_index = pos & (PAGE_CACHE_SIZE - 1);
+ tail_index = pos & (PAGE_SIZE - 1);
cur_index = 0;
head = page_buffers(page);
bh = head;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4a62fe8cc3bf..b8f2d1e8c645 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s)
pathrelse(&path);
inode = reiserfs_iget(s, &obj_key);
- if (!inode) {
+ if (IS_ERR_OR_NULL(inode)) {
/*
* the unlink almost completed, it just did not
* manage to remove "save" link and release objectid
@@ -626,7 +626,8 @@ static int __init init_inodecache(void)
sizeof(struct
reiserfs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT),
init_once);
if (reiserfs_inode_cachep == NULL)
return -ENOMEM;
@@ -801,6 +802,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
.write_info = reiserfs_write_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+ .get_next_id = dquot_get_next_id,
};
static const struct quotactl_ops reiserfs_qctl_operations = {
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index f41e19b4bb42..2d5489b0a269 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -151,7 +151,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
*/
if (up_to_date_bh) {
unsigned pgoff =
- (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
+ (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
char *kaddr = kmap_atomic(up_to_date_bh->b_page);
memset(kaddr + pgoff, 0, blk_size - total_tail);
kunmap_atomic(kaddr);
@@ -271,7 +271,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
* the page was locked and this part of the page was up to date when
* indirect2direct was called, so we know the bytes are still valid
*/
- tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
+ tail = tail + (pos & (PAGE_SIZE - 1));
PATH_LAST_POSITION(path)++;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e87f9b52bf06..28f5f8b11370 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -64,14 +64,14 @@
#ifdef CONFIG_REISERFS_FS_XATTR
static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
{
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
return dir->i_op->create(dir, dentry, mode, true);
}
#endif
static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
return dir->i_op->mkdir(dir, dentry, mode);
}
@@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
error = dir->i_op->unlink(dir, dentry);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
if (!error)
d_delete(dentry);
@@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
- BUG_ON(!mutex_is_locked(&dir->i_mutex));
+ BUG_ON(!inode_is_locked(dir));
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
error = dir->i_op->rmdir(dir, dentry);
if (!error)
d_inode(dentry)->i_flags |= S_DEAD;
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
if (!error)
d_delete(dentry);
@@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
if (d_really_is_negative(privroot))
return ERR_PTR(-ENODATA);
- mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
xaroot = dget(REISERFS_SB(sb)->xattr_root);
if (!xaroot)
@@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
}
}
- mutex_unlock(&d_inode(privroot)->i_mutex);
+ inode_unlock(d_inode(privroot));
return xaroot;
}
@@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
le32_to_cpu(INODE_PKEY(inode)->k_objectid),
inode->i_generation);
- mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
@@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
}
}
- mutex_unlock(&d_inode(xaroot)->i_mutex);
+ inode_unlock(d_inode(xaroot));
dput(xaroot);
return xadir;
}
@@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
container_of(ctx, struct reiserfs_dentry_buf, ctx);
struct dentry *dentry;
- WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
return -ENOSPC;
@@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
goto out_dir;
}
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
buf.xadir = dir;
while (1) {
@@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
break;
buf.count = 0;
}
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
cleanup_dentry_buf(&buf);
@@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
if (!err) {
int jerror;
- mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex,
+ inode_lock_nested(d_inode(dir->d_parent),
I_MUTEX_XATTR);
err = action(dir, data);
reiserfs_write_lock(inode->i_sb);
jerror = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&d_inode(dir->d_parent)->i_mutex);
+ inode_unlock(d_inode(dir->d_parent));
err = jerror ?: err;
}
}
@@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
if (IS_ERR(xadir))
return ERR_CAST(xadir);
- mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
xafile = lookup_one_len(name, xadir, strlen(name));
if (IS_ERR(xafile)) {
err = PTR_ERR(xafile);
@@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
if (err)
dput(xafile);
out:
- mutex_unlock(&d_inode(xadir)->i_mutex);
+ inode_unlock(d_inode(xadir));
dput(xadir);
if (err)
return ERR_PTR(err);
@@ -415,7 +415,7 @@ out:
static inline void reiserfs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
static struct page *reiserfs_get_page(struct inode *dir, size_t n)
@@ -427,7 +427,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
* and an unlink/rmdir has just occurred - GFP_NOFS avoids this
*/
mapping_set_gfp_mask(mapping, GFP_NOFS);
- page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
+ page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
if (!IS_ERR(page)) {
kmap(page);
if (PageError(page))
@@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
if (IS_ERR(xadir))
return PTR_ERR(xadir);
- mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
dentry = lookup_one_len(name, xadir, strlen(name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
dput(dentry);
out_dput:
- mutex_unlock(&d_inode(xadir)->i_mutex);
+ inode_unlock(d_inode(xadir));
dput(xadir);
return err;
}
@@ -526,10 +526,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
while (buffer_pos < buffer_size || buffer_pos == 0) {
size_t chunk;
size_t skip = 0;
- size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
+ size_t page_offset = (file_pos & (PAGE_SIZE - 1));
- if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
- chunk = PAGE_CACHE_SIZE;
+ if (buffer_size - buffer_pos > PAGE_SIZE)
+ chunk = PAGE_SIZE;
else
chunk = buffer_size - buffer_pos;
@@ -546,8 +546,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
struct reiserfs_xattr_header *rxh;
skip = file_pos = sizeof(struct reiserfs_xattr_header);
- if (chunk + skip > PAGE_CACHE_SIZE)
- chunk = PAGE_CACHE_SIZE - skip;
+ if (chunk + skip > PAGE_SIZE)
+ chunk = PAGE_SIZE - skip;
rxh = (struct reiserfs_xattr_header *)data;
rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
rxh->h_hash = cpu_to_le32(xahash);
@@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
.ia_valid = ATTR_SIZE | ATTR_CTIME,
};
- mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
inode_dio_wait(d_inode(dentry));
err = reiserfs_setattr(dentry, &newattrs);
- mutex_unlock(&d_inode(dentry)->i_mutex);
+ inode_unlock(d_inode(dentry));
} else
update_ctime(inode);
out_unlock:
@@ -675,8 +675,8 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
char *data;
size_t skip = 0;
- if (isize - file_pos > PAGE_CACHE_SIZE)
- chunk = PAGE_CACHE_SIZE;
+ if (isize - file_pos > PAGE_SIZE)
+ chunk = PAGE_SIZE;
else
chunk = isize - file_pos;
@@ -756,7 +756,8 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers,
return NULL;
for_each_xattr_handler(handlers, xah) {
- if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
+ const char *prefix = xattr_prefix(xah);
+ if (strncmp(prefix, name, strlen(prefix)) == 0)
break;
}
@@ -778,7 +779,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -797,7 +798,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -814,7 +815,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
struct listxattr_buf {
@@ -839,19 +840,16 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
name);
- if (!handler) /* Unsupported xattr name */
+ if (!handler /* Unsupported xattr name */ ||
+ (handler->list && !handler->list(b->dentry)))
return 0;
+ size = namelen + 1;
if (b->buf) {
- size = handler->list(b->dentry, b->buf + b->pos,
- b->size, name, namelen,
- handler->flags);
if (size > b->size)
return -ERANGE;
- } else {
- size = handler->list(b->dentry, NULL, 0, name,
- namelen, handler->flags);
+ memcpy(b->buf + b->pos, name, namelen);
+ b->buf[b->pos + namelen] = 0;
}
-
b->pos += size;
}
return 0;
@@ -890,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
goto out;
}
- mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+ inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
- mutex_unlock(&d_inode(dir)->i_mutex);
+ inode_unlock(d_inode(dir));
if (!err)
err = buf.pos;
@@ -907,7 +905,7 @@ static int create_privroot(struct dentry *dentry)
int err;
struct inode *inode = d_inode(dentry->d_parent);
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(inode));
err = xattr_mkdir(inode, dentry, 0700);
if (err || d_really_is_negative(dentry)) {
@@ -997,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
int err = 0;
/* If we don't have the privroot located yet - go find it */
- mutex_lock(&d_inode(s->s_root)->i_mutex);
+ inode_lock(d_inode(s->s_root));
dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
strlen(PRIVROOT_NAME));
if (!IS_ERR(dentry)) {
@@ -1007,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
d_inode(dentry)->i_flags |= S_PRIVATE;
} else
err = PTR_ERR(dentry);
- mutex_unlock(&d_inode(s->s_root)->i_mutex);
+ inode_unlock(d_inode(s->s_root));
return err;
}
@@ -1027,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
goto error;
if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
- mutex_lock(&d_inode(s->s_root)->i_mutex);
+ inode_lock(d_inode(s->s_root));
err = create_privroot(REISERFS_SB(s)->priv_root);
- mutex_unlock(&d_inode(s->s_root)->i_mutex);
+ inode_unlock(d_inode(s->s_root));
}
if (d_really_is_positive(privroot)) {
s->s_xattr = reiserfs_xattr_handlers;
- mutex_lock(&d_inode(privroot)->i_mutex);
+ inode_lock(d_inode(privroot));
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
@@ -1045,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
else
err = PTR_ERR(dentry);
}
- mutex_unlock(&d_inode(privroot)->i_mutex);
+ inode_unlock(d_inode(privroot));
}
error:
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 4b34b9dc03dd..558a16beaacb 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -186,10 +186,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
break;
default:
BUG();
@@ -244,7 +244,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
switch (type) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
if (acl) {
error = posix_acl_equiv_mode(acl, &inode->i_mode);
if (error < 0)
@@ -256,7 +256,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
if (!S_ISDIR(inode->i_mode))
return acl ? -EACCES : 0;
break;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 9a3b0616f283..ab0217d32039 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,8 +9,8 @@
#include <linux/uaccess.h>
static int
-security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+security_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -22,8 +22,8 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-security_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+security_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -34,20 +34,9 @@ security_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t namelen, int handler_flags)
+static bool security_list(struct dentry *dentry)
{
- const size_t len = namelen + 1;
-
- if (IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_len) {
- memcpy(list, name, namelen);
- list[namelen] = '\0';
- }
-
- return len;
+ return !IS_PRIVATE(d_inode(dentry));
}
/* Initializes the security context for a new inode and returns the number
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index e4f1343714e0..64b67aa643a9 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,8 +8,8 @@
#include <linux/uaccess.h>
static int
-trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+trusted_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -21,8 +21,8 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-trusted_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -33,19 +33,9 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static bool trusted_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
- return 0;
-
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
}
const struct xattr_handler reiserfs_xattr_trusted_handler = {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index d0b08d3e5689..12e6306f562a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,8 +7,8 @@
#include <linux/uaccess.h>
static int
-user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+user_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
@@ -19,8 +19,8 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-user_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+user_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
return -EINVAL;
@@ -30,18 +30,9 @@ user_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static bool user_list(struct dentry *dentry)
{
- const size_t len = name_len + 1;
-
- if (!reiserfs_xattrs_user(dentry->d_sb))
- return 0;
- if (list && len <= list_size) {
- memcpy(list, name, name_len);
- list[name_len] = '\0';
- }
- return len;
+ return reiserfs_xattrs_user(dentry->d_sb);
}
const struct xattr_handler reiserfs_xattr_user_handler = {
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 268733cda397..6b00ca357c58 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -360,6 +360,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
break;
case ROMFH_SYM:
i->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(i);
i->i_data.a_ops = &romfs_aops;
mode |= S_IRWXUGO;
break;
@@ -618,8 +619,8 @@ static int __init init_romfs_fs(void)
romfs_inode_cachep =
kmem_cache_create("romfs_i",
sizeof(struct romfs_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
- romfs_i_init_once);
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT, romfs_i_init_once);
if (!romfs_inode_cachep) {
pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 015547330e88..869293988c2a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -70,9 +70,9 @@ static long __estimate_accuracy(struct timespec *tv)
return slack;
}
-long select_estimate_accuracy(struct timespec *tv)
+u64 select_estimate_accuracy(struct timespec *tv)
{
- unsigned long ret;
+ u64 ret;
struct timespec now;
/*
@@ -402,7 +402,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
struct poll_wqueues table;
poll_table *wait;
int retval, i, timed_out = 0;
- unsigned long slack = 0;
+ u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_end = 0;
@@ -778,13 +778,13 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
return mask;
}
-static int do_poll(unsigned int nfds, struct poll_list *list,
- struct poll_wqueues *wait, struct timespec *end_time)
+static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
+ struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
- unsigned long slack = 0;
+ u64 slack = 0;
unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_end = 0;
@@ -908,7 +908,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
}
poll_initwait(&table);
- fdcount = do_poll(nfds, head, &table, end_time);
+ fdcount = do_poll(head, &table, end_time);
poll_freewait(&table);
for (walk = head; walk; walk = walk->next) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 225586e141ca..19f532e7d35e 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
+#include <linux/string_helpers.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -25,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
void *buf;
+ gfp_t gfp = GFP_KERNEL;
/*
- * __GFP_NORETRY to avoid oom-killings with high-order allocations -
- * it's better to fall back to vmalloc() than to kill things.
+ * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
+ * it's better to fall back to vmalloc() than to kill things. For small
+ * allocations, just use GFP_KERNEL which will oom kill, thus no need
+ * for vmalloc fallback.
*/
- buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ if (size > PAGE_SIZE)
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ buf = kmalloc(size, gfp);
if (!buf && size > PAGE_SIZE)
buf = vmalloc(size);
return buf;
@@ -66,9 +72,10 @@ int seq_open(struct file *file, const struct seq_operations *op)
mutex_init(&p->lock);
p->op = op;
-#ifdef CONFIG_USER_NS
- p->user_ns = file->f_cred->user_ns;
-#endif
+
+ // No refcounting: the lifetime of 'p' is constrained
+ // to the lifetime of the file.
+ p->file = file;
/*
* Wrappers around seq_open(e.g. swaps_open) need to be
@@ -377,26 +384,12 @@ EXPORT_SYMBOL(seq_release);
*/
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *end = m->buf + m->size;
- char *p;
- char c;
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
- for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
- if (!strchr(esc, c)) {
- *p++ = c;
- continue;
- }
- if (p + 3 < end) {
- *p++ = '\\';
- *p++ = '0' + ((c & 0300) >> 6);
- *p++ = '0' + ((c & 070) >> 3);
- *p++ = '0' + (c & 07);
- continue;
- }
- seq_set_overflow(m);
- return;
- }
- m->count = p - m->buf;
+ ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
+ seq_commit(m, ret < size ? ret : -1);
}
EXPORT_SYMBOL(seq_escape);
@@ -773,6 +766,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
{
const u8 *ptr = buf;
int i, linelen, remaining = len;
+ char *buffer;
+ size_t size;
int ret;
if (rowsize != 16 && rowsize != 32)
@@ -794,15 +789,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
break;
}
+ size = seq_get_buf(m, &buffer);
ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
- m->buf + m->count, m->size - m->count,
- ascii);
- if (ret >= m->size - m->count) {
- seq_set_overflow(m);
- } else {
- m->count += ret;
- seq_putc(m, '\n');
- }
+ buffer, size, ascii);
+ seq_commit(m, ret < size ? ret : -1);
+
+ seq_putc(m, '\n');
}
}
EXPORT_SYMBOL(seq_hex_dump);
diff --git a/fs/splice.c b/fs/splice.c
index 5fc1e50a7f30..b018eb485019 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -88,7 +88,7 @@ out_unlock:
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- page_cache_release(buf->page);
+ put_page(buf->page);
buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int spd_pages = spd->nr_pages;
int ret, do_wakeup, page_nr;
+ if (!spd_pages)
+ return 0;
+
ret = 0;
do_wakeup = 0;
page_nr = 0;
@@ -265,7 +268,7 @@ EXPORT_SYMBOL_GPL(splice_to_pipe);
void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
- page_cache_release(spd->pages[i]);
+ put_page(spd->pages[i]);
}
/*
@@ -325,9 +328,9 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
if (splice_grow_spd(pipe, &spd))
return -ENOMEM;
- index = *ppos >> PAGE_CACHE_SHIFT;
- loff = *ppos & ~PAGE_CACHE_MASK;
- req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
+ loff = *ppos & ~PAGE_MASK;
+ req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT;
nr_pages = min(req_pages, spd.nr_pages_max);
/*
@@ -360,9 +363,9 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(error)) {
- page_cache_release(page);
+ put_page(page);
if (error == -EEXIST)
continue;
break;
@@ -382,7 +385,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* Now loop over the map and see if we need to start IO on any
* pages, fill in the partial map, etc.
*/
- index = *ppos >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
nr_pages = spd.nr_pages;
spd.nr_pages = 0;
for (page_nr = 0; page_nr < nr_pages; page_nr++) {
@@ -394,7 +397,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
/*
* this_len is the max we'll use from this page
*/
- this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ this_len = min_t(unsigned long, len, PAGE_SIZE - loff);
page = spd.pages[page_nr];
if (PageReadahead(page))
@@ -415,6 +418,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
*/
if (!page->mapping) {
unlock_page(page);
+retry_lookup:
page = find_or_create_page(mapping, index,
mapping_gfp_mask(mapping));
@@ -422,7 +426,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
error = -ENOMEM;
break;
}
- page_cache_release(spd.pages[page_nr]);
+ put_page(spd.pages[page_nr]);
spd.pages[page_nr] = page;
}
/*
@@ -439,13 +443,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
error = mapping->a_ops->readpage(in, page);
if (unlikely(error)) {
/*
- * We really should re-lookup the page here,
- * but it complicates things a lot. Instead
- * lets just do what we already stored, and
- * we'll get it the next time we are called.
+ * Re-lookup the page
*/
if (error == AOP_TRUNCATED_PAGE)
- error = 0;
+ goto retry_lookup;
break;
}
@@ -455,7 +456,7 @@ fill_it:
* i_size must be checked after PageUptodate.
*/
isize = i_size_read(mapping->host);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (isize - 1) >> PAGE_SHIFT;
if (unlikely(!isize || index > end_index))
break;
@@ -469,7 +470,7 @@ fill_it:
/*
* max good bytes in this page
*/
- plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ plen = ((isize - 1) & ~PAGE_MASK) + 1;
if (plen <= loff)
break;
@@ -493,8 +494,8 @@ fill_it:
* we got, 'nr_pages' is how many pages are in the map.
*/
while (page_nr < nr_pages)
- page_cache_release(spd.pages[page_nr++]);
- in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+ put_page(spd.pages[page_nr++]);
+ in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);
@@ -579,7 +580,7 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
- res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
set_fs(old_fs);
return res;
@@ -635,8 +636,8 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
goto shrink_ret;
}
- offset = *ppos & ~PAGE_CACHE_MASK;
- nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
+ nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
struct page *page;
@@ -646,7 +647,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
if (!page)
goto err;
- this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+ this_len = min_t(size_t, len, PAGE_SIZE - offset);
vec[i].iov_base = (void __user *) page_address(page);
vec[i].iov_len = this_len;
spd.pages[i] = page;
@@ -809,6 +810,13 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
*/
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
+ /*
+ * Check for signal early to make process killable when there are
+ * always buffers available
+ */
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
while (!pipe->nrbufs) {
if (!pipe->writers)
return 0;
@@ -884,6 +892,7 @@ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_from_pipe_begin(sd);
do {
+ cond_resched();
ret = splice_from_pipe_next(pipe, sd);
if (ret > 0)
ret = splice_from_pipe_feed(pipe, sd, actor);
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 0cea9b9236d0..2c2618410d51 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -181,11 +181,11 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
in = min(bytes, msblk->devblksize - offset);
bytes -= in;
while (in) {
- if (pg_offset == PAGE_CACHE_SIZE) {
+ if (pg_offset == PAGE_SIZE) {
data = squashfs_next_page(output);
pg_offset = 0;
}
- avail = min_t(int, in, PAGE_CACHE_SIZE -
+ avail = min_t(int, in, PAGE_SIZE -
pg_offset);
memcpy(data + pg_offset, bh[k]->b_data + offset,
avail);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 1cb70a0b2168..23813c078cc9 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -30,7 +30,7 @@
* access the metadata and fragment caches.
*
* To avoid out of memory and fragmentation issues with vmalloc the cache
- * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
+ * uses sequences of kmalloced PAGE_SIZE buffers.
*
* It should be noted that the cache is not used for file datablocks, these
* are decompressed and cached in the page-cache in the normal way. The
@@ -231,7 +231,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
/*
* Initialise cache allocating the specified number of entries, each of
* size block_size. To avoid vmalloc fragmentation issues each entry
- * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
+ * is allocated as a sequence of kmalloced PAGE_SIZE buffers.
*/
struct squashfs_cache *squashfs_cache_init(char *name, int entries,
int block_size)
@@ -255,7 +255,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
cache->unused = entries;
cache->entries = entries;
cache->block_size = block_size;
- cache->pages = block_size >> PAGE_CACHE_SHIFT;
+ cache->pages = block_size >> PAGE_SHIFT;
cache->pages = cache->pages ? cache->pages : 1;
cache->name = name;
cache->num_waiters = 0;
@@ -275,7 +275,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
}
for (j = 0; j < cache->pages; j++) {
- entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ entry->data[j] = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (entry->data[j] == NULL) {
ERROR("Failed to allocate %s buffer\n", name);
goto cleanup;
@@ -314,10 +314,10 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
return min(length, entry->length - offset);
while (offset < entry->length) {
- void *buff = entry->data[offset / PAGE_CACHE_SIZE]
- + (offset % PAGE_CACHE_SIZE);
+ void *buff = entry->data[offset / PAGE_SIZE]
+ + (offset % PAGE_SIZE);
int bytes = min_t(int, entry->length - offset,
- PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
+ PAGE_SIZE - (offset % PAGE_SIZE));
if (bytes >= remaining) {
memcpy(buffer, buff, remaining);
@@ -415,7 +415,7 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
*/
void *squashfs_read_table(struct super_block *sb, u64 block, int length)
{
- int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ int pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
int i, res;
void *table, *buffer, **data;
struct squashfs_page_actor *actor;
@@ -436,7 +436,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
goto failed2;
}
- for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
+ for (i = 0; i < pages; i++, buffer += PAGE_SIZE)
data[i] = buffer;
res = squashfs_read_data(sb, block, length |
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index e9034bf6e5ae..d2bc13636f79 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -102,7 +102,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags)
* Read decompressor specific options from file system if present
*/
if (SQUASHFS_COMP_OPTS(flags)) {
- buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (buffer == NULL) {
comp_opts = ERR_PTR(-ENOMEM);
goto out;
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index e5c9689062ba..13d80947bf9e 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -175,7 +175,7 @@ static long long read_indexes(struct super_block *sb, int n,
{
int err, i;
long long block = 0;
- __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ __le32 *blist = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (blist == NULL) {
ERROR("read_indexes: Failed to allocate block_list\n");
@@ -183,7 +183,7 @@ static long long read_indexes(struct super_block *sb, int n,
}
while (n) {
- int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
+ int blocks = min_t(int, n, PAGE_SIZE >> 2);
err = squashfs_read_metadata(sb, blist, start_block,
offset, blocks << 2);
@@ -377,19 +377,19 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
void *pageaddr;
- int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+ int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
int start_index = page->index & ~mask, end_index = start_index | mask;
/*
* Loop copying datablock into pages. As the datablock likely covers
- * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
+ * many PAGE_SIZE pages (default block size is 128 KiB) explicitly
* grab the pages from the page cache, except for the page that we've
* been called to fill.
*/
for (i = start_index; i <= end_index && bytes > 0; i++,
- bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+ bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
struct page *push_page;
- int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
+ int avail = buffer ? min_t(int, bytes, PAGE_SIZE) : 0;
TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
@@ -404,14 +404,14 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
pageaddr = kmap_atomic(push_page);
squashfs_copy_data(pageaddr, buffer, offset, avail);
- memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+ memset(pageaddr + avail, 0, PAGE_SIZE - avail);
kunmap_atomic(pageaddr);
flush_dcache_page(push_page);
SetPageUptodate(push_page);
skip_page:
unlock_page(push_page);
if (i != page->index)
- page_cache_release(push_page);
+ put_page(push_page);
}
}
@@ -454,7 +454,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+ int index = page->index >> (msblk->block_log - PAGE_SHIFT);
int file_end = i_size_read(inode) >> msblk->block_log;
int res;
void *pageaddr;
@@ -462,8 +462,8 @@ static int squashfs_readpage(struct file *file, struct page *page)
TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
page->index, squashfs_i(inode)->start);
- if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT))
+ if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT))
goto out;
if (index < file_end || squashfs_i(inode)->fragment_block ==
@@ -487,7 +487,7 @@ error_out:
SetPageError(page);
out:
pageaddr = kmap_atomic(page);
- memset(pageaddr, 0, PAGE_CACHE_SIZE);
+ memset(pageaddr, 0, PAGE_SIZE);
kunmap_atomic(pageaddr);
flush_dcache_page(page);
if (!PageError(page))
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 43e7a7eddac0..cb485d8e0e91 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -30,8 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
struct inode *inode = target_page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
- int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+ int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+ int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
int start_index = target_page->index & ~mask;
int end_index = start_index | mask;
int i, n, pages, missing_pages, bytes, res = -ENOMEM;
@@ -68,7 +68,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
if (PageUptodate(page[i])) {
unlock_page(page[i]);
- page_cache_release(page[i]);
+ put_page(page[i]);
page[i] = NULL;
missing_pages++;
}
@@ -96,10 +96,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
goto mark_errored;
/* Last page may have trailing bytes not filled */
- bytes = res % PAGE_CACHE_SIZE;
+ bytes = res % PAGE_SIZE;
if (bytes) {
pageaddr = kmap_atomic(page[pages - 1]);
- memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+ memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
kunmap_atomic(pageaddr);
}
@@ -109,7 +109,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
SetPageUptodate(page[i]);
unlock_page(page[i]);
if (page[i] != target_page)
- page_cache_release(page[i]);
+ put_page(page[i]);
}
kfree(actor);
@@ -127,7 +127,7 @@ mark_errored:
flush_dcache_page(page[i]);
SetPageError(page[i]);
unlock_page(page[i]);
- page_cache_release(page[i]);
+ put_page(page[i]);
}
out:
@@ -153,21 +153,21 @@ static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
}
for (n = 0; n < pages && bytes > 0; n++,
- bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
- int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+ bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
+ int avail = min_t(int, bytes, PAGE_SIZE);
if (page[n] == NULL)
continue;
pageaddr = kmap_atomic(page[n]);
squashfs_copy_data(pageaddr, buffer, offset, avail);
- memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+ memset(pageaddr + avail, 0, PAGE_SIZE - avail);
kunmap_atomic(pageaddr);
flush_dcache_page(page[n]);
SetPageUptodate(page[n]);
unlock_page(page[n]);
if (page[n] != target_page)
- page_cache_release(page[n]);
+ put_page(page[n]);
}
out:
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index a1ce5ce60632..0927b1e80ab6 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -41,6 +41,7 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/xattr.h>
+#include <linux/pagemap.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
@@ -291,6 +292,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
inode->i_op = &squashfs_symlink_inode_ops;
+ inode_nohighmem(inode);
inode->i_data.a_ops = &squashfs_symlink_aops;
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index c31e2bc9c081..ff4468bd18b0 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -117,13 +117,13 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
data = squashfs_first_page(output);
buff = stream->output;
while (data) {
- if (bytes <= PAGE_CACHE_SIZE) {
+ if (bytes <= PAGE_SIZE) {
memcpy(data, buff, bytes);
break;
}
- memcpy(data, buff, PAGE_CACHE_SIZE);
- buff += PAGE_CACHE_SIZE;
- bytes -= PAGE_CACHE_SIZE;
+ memcpy(data, buff, PAGE_SIZE);
+ buff += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
}
squashfs_finish_page(output);
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 244b9fbfff7b..934c17e96590 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -102,13 +102,13 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
data = squashfs_first_page(output);
buff = stream->output;
while (data) {
- if (bytes <= PAGE_CACHE_SIZE) {
+ if (bytes <= PAGE_SIZE) {
memcpy(data, buff, bytes);
break;
} else {
- memcpy(data, buff, PAGE_CACHE_SIZE);
- buff += PAGE_CACHE_SIZE;
- bytes -= PAGE_CACHE_SIZE;
+ memcpy(data, buff, PAGE_SIZE);
+ buff += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
data = squashfs_next_page(output);
}
}
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 5a1c11f56441..9b7b1b6a7892 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -48,7 +48,7 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
if (actor == NULL)
return NULL;
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->length = length ? : pages * PAGE_SIZE;
actor->buffer = buffer;
actor->pages = pages;
actor->next_page = 0;
@@ -88,7 +88,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
if (actor == NULL)
return NULL;
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->length = length ? : pages * PAGE_SIZE;
actor->page = page;
actor->pages = pages;
actor->next_page = 0;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 26dd82008b82..98537eab27e2 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -24,7 +24,7 @@ static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
if (actor == NULL)
return NULL;
- actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->length = length ? : pages * PAGE_SIZE;
actor->page = page;
actor->pages = pages;
actor->next_page = 0;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5056babe00df..cf01e15a7b16 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -80,7 +80,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct squashfs_sb_info *msblk;
struct squashfs_super_block *sblk = NULL;
- char b[BDEVNAME_SIZE];
struct inode *root;
long long root_inode;
unsigned short flags;
@@ -124,8 +123,8 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = le32_to_cpu(sblk->s_magic);
if (sb->s_magic != SQUASHFS_MAGIC) {
if (!silent)
- ERROR("Can't find a SQUASHFS superblock on %s\n",
- bdevname(sb->s_bdev, b));
+ ERROR("Can't find a SQUASHFS superblock on %pg\n",
+ sb->s_bdev);
goto failed_mount;
}
@@ -153,7 +152,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
* Check the system page size is not larger than the filesystem
* block size (by default 128K). This is currently not supported.
*/
- if (PAGE_CACHE_SIZE > msblk->block_size) {
+ if (PAGE_SIZE > msblk->block_size) {
ERROR("Page size > filesystem block size (%d). This is "
"currently not supported!\n", msblk->block_size);
goto failed_mount;
@@ -178,7 +177,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
msblk->inodes = le32_to_cpu(sblk->inodes);
flags = le16_to_cpu(sblk->flags);
- TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+ TRACE("Found valid superblock on %pg\n", sb->s_bdev);
TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
? "un" : "");
TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
@@ -420,7 +419,8 @@ static int __init init_inodecache(void)
{
squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
sizeof(struct squashfs_inode_info), 0,
- SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+ init_once);
return squashfs_inode_cachep ? 0 : -ENOMEM;
}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 12806dffb345..d688ef42a6a1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -48,10 +48,10 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct squashfs_sb_info *msblk = sb->s_fs_info;
- int index = page->index << PAGE_CACHE_SHIFT;
+ int index = page->index << PAGE_SHIFT;
u64 block = squashfs_i(inode)->start;
int offset = squashfs_i(inode)->offset;
- int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
+ int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE);
int bytes, copied;
void *pageaddr;
struct squashfs_cache_entry *entry;
@@ -94,7 +94,7 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)
copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
length - bytes);
if (copied == length - bytes)
- memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
+ memset(pageaddr + length, 0, PAGE_SIZE - length);
else
block = entry->next_index;
kunmap_atomic(pageaddr);
@@ -119,8 +119,7 @@ const struct address_space_operations squashfs_symlink_aops = {
const struct inode_operations squashfs_symlink_inode_ops = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getxattr = generic_getxattr,
.listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e5e0ddf5b143..1e9de96288d8 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -58,7 +58,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
struct squashfs_xattr_entry entry;
struct squashfs_xattr_val val;
const struct xattr_handler *handler;
- int name_size, prefix_size = 0;
+ int name_size;
err = squashfs_read_metadata(sb, &entry, &start, &offset,
sizeof(entry));
@@ -67,15 +67,16 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
name_size = le16_to_cpu(entry.size);
handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
- if (handler)
- prefix_size = handler->list(d, buffer, rest, NULL,
- name_size, handler->flags);
- if (prefix_size) {
+ if (handler && (!handler->list || handler->list(d))) {
+ const char *prefix = handler->prefix ?: handler->name;
+ size_t prefix_size = strlen(prefix);
+
if (buffer) {
if (prefix_size + name_size + 1 > rest) {
err = -ERANGE;
goto failed;
}
+ memcpy(buffer, prefix, prefix_size);
buffer += prefix_size;
}
err = squashfs_read_metadata(sb, buffer, &start,
@@ -212,88 +213,45 @@ failed:
}
-/*
- * User namespace support
- */
-static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- if (list && XATTR_USER_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
- return XATTR_USER_PREFIX_LEN;
-}
-
-static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
- size_t size, int type)
+static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *d, const char *name,
+ void *buffer, size_t size)
{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name,
+ return squashfs_xattr_get(d_inode(d), handler->flags, name,
buffer, size);
}
+/*
+ * User namespace support
+ */
static const struct xattr_handler squashfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = squashfs_user_list,
- .get = squashfs_user_get
+ .flags = SQUASHFS_XATTR_USER,
+ .get = squashfs_xattr_handler_get
};
/*
* Trusted namespace support
*/
-static size_t squashfs_trusted_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static bool squashfs_trusted_xattr_handler_list(struct dentry *d)
{
- if (!capable(CAP_SYS_ADMIN))
- return 0;
-
- if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
- return XATTR_TRUSTED_PREFIX_LEN;
-}
-
-static int squashfs_trusted_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name,
- buffer, size);
+ return capable(CAP_SYS_ADMIN);
}
static const struct xattr_handler squashfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = squashfs_trusted_list,
- .get = squashfs_trusted_get
+ .flags = SQUASHFS_XATTR_TRUSTED,
+ .list = squashfs_trusted_xattr_handler_list,
+ .get = squashfs_xattr_handler_get
};
/*
* Security namespace support
*/
-static size_t squashfs_security_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
- return XATTR_SECURITY_PREFIX_LEN;
-}
-
-static int squashfs_security_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name,
- buffer, size);
-}
-
static const struct xattr_handler squashfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = squashfs_security_list,
- .get = squashfs_security_get
+ .flags = SQUASHFS_XATTR_SECURITY,
+ .get = squashfs_xattr_handler_get
};
static const struct xattr_handler *squashfs_xattr_handler(int type)
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index c609624e4b8a..6bfaef73d065 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -141,7 +141,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.in_pos = 0;
stream->buf.in_size = 0;
stream->buf.out_pos = 0;
- stream->buf.out_size = PAGE_CACHE_SIZE;
+ stream->buf.out_size = PAGE_SIZE;
stream->buf.out = squashfs_first_page(output);
do {
@@ -158,7 +158,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
stream->buf.out = squashfs_next_page(output);
if (stream->buf.out != NULL) {
stream->buf.out_pos = 0;
- total += PAGE_CACHE_SIZE;
+ total += PAGE_SIZE;
}
}
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 8727caba6882..2ec24d128bce 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -69,7 +69,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
int zlib_err, zlib_init = 0, k = 0;
z_stream *stream = strm;
- stream->avail_out = PAGE_CACHE_SIZE;
+ stream->avail_out = PAGE_SIZE;
stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
@@ -85,7 +85,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
if (stream->avail_out == 0) {
stream->next_out = squashfs_next_page(output);
if (stream->next_out != NULL)
- stream->avail_out = PAGE_CACHE_SIZE;
+ stream->avail_out = PAGE_SIZE;
}
if (!zlib_init) {
diff --git a/fs/stat.c b/fs/stat.c
index cccc1aab9a8b..bc045c7994e1 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
#ifndef INIT_STRUCT_STAT_PADDING
@@ -367,8 +367,6 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
INIT_STRUCT_STAT64_PADDING(tmp);
#ifdef CONFIG_MIPS
/* mips has weird padding, so we don't get 64 bits there */
- if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
- return -EOVERFLOW;
tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_rdev = new_encode_dev(stat->rdev);
#else
diff --git a/fs/super.c b/fs/super.c
index 6cd9f719cf61..d78b9847e6cb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb)
sb->s_flags &= ~MS_ACTIVE;
fsnotify_unmount_inodes(sb);
+ cgroup_writeback_umount();
evict_inodes(sb);
@@ -1012,10 +1013,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
blkdev_put(bdev, mode);
down_write(&s->s_umount);
} else {
- char b[BDEVNAME_SIZE];
-
s->s_mode = mode;
- strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+ snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error) {
diff --git a/fs/sync.c b/fs/sync.c
index 4ec430ae2b0d..2a54c1f22035 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -302,7 +302,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
goto out;
if (sizeof(pgoff_t) == 4) {
- if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+ if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* The range starts outside a 32 bit machine's
* pagecache addressing capabilities. Let it "succeed"
@@ -310,7 +310,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
ret = 0;
goto out;
}
- if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+ if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* Out to EOF
*/
@@ -348,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
- ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
if (ret < 0)
goto out_put;
}
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index e1236594fffe..dc1358b5ec95 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -73,13 +73,26 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
}
if (grp->bin_attrs) {
- for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+ for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) {
+ umode_t mode = (*bin_attr)->attr.mode;
+
if (update)
kernfs_remove_by_name(parent,
(*bin_attr)->attr.name);
+ if (grp->is_bin_visible) {
+ mode = grp->is_bin_visible(kobj, *bin_attr, i);
+ if (!mode)
+ continue;
+ }
+
+ WARN(mode & ~(SYSFS_PREALLOC | 0664),
+ "Attribute %s: Invalid permissions 0%o\n",
+ (*bin_attr)->attr.name, mode);
+
+ mode &= SYSFS_PREALLOC | 0664;
error = sysfs_add_file_mode_ns(parent,
&(*bin_attr)->attr, true,
- (*bin_attr)->attr.mode, NULL);
+ mode, NULL);
if (error)
break;
}
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 63c1bcb224ee..c0f0a3e643eb 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -30,7 +30,7 @@ const struct file_operations sysv_dir_operations = {
static inline void dir_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
@@ -73,8 +73,8 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
if (pos >= inode->i_size)
return 0;
- offset = pos & ~PAGE_CACHE_MASK;
- n = pos >> PAGE_CACHE_SHIFT;
+ offset = pos & ~PAGE_MASK;
+ n = pos >> PAGE_SHIFT;
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
@@ -85,7 +85,7 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
continue;
kaddr = (char *)page_address(page);
de = (struct sysv_dir_entry *)(kaddr+offset);
- limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+ limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE;
for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
char *name = de->name;
@@ -146,7 +146,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
if (!IS_ERR(page)) {
kaddr = (char*)page_address(page);
de = (struct sysv_dir_entry *) kaddr;
- kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+ kaddr += PAGE_SIZE - SYSV_DIRSIZE;
for ( ; (char *) de <= kaddr ; de++) {
if (!de->inode)
continue;
@@ -190,7 +190,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
goto out;
kaddr = (char*)page_address(page);
de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+ kaddr += PAGE_SIZE - SYSV_DIRSIZE;
while ((char *)de <= kaddr) {
if (!de->inode)
goto got_it;
@@ -261,7 +261,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir)
kmap(page);
base = (char*)page_address(page);
- memset(base, 0, PAGE_CACHE_SIZE);
+ memset(base, 0, PAGE_SIZE);
de = (struct sysv_dir_entry *) base;
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
@@ -273,7 +273,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir)
kunmap(page);
err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
@@ -296,7 +296,7 @@ int sysv_empty_dir(struct inode * inode)
kaddr = (char *)page_address(page);
de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE;
+ kaddr += PAGE_SIZE-SYSV_DIRSIZE;
for ( ;(char *)de <= kaddr; de++) {
if (!de->inode)
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 590ad9206e3f..d62c423a5a2d 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -146,8 +146,7 @@ static inline void write3byte(struct sysv_sb_info *sbi,
static const struct inode_operations sysv_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
+ .get_link = page_get_link,
.getattr = sysv_getattr,
};
@@ -162,15 +161,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
inode->i_fop = &sysv_dir_operations;
inode->i_mapping->a_ops = &sysv_aops;
} else if (S_ISLNK(inode->i_mode)) {
- if (inode->i_blocks) {
- inode->i_op = &sysv_symlink_inode_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else {
- inode->i_op = &simple_symlink_inode_operations;
- inode->i_link = (char *)SYSV_I(inode)->i_data;
- nd_terminate_link(inode->i_link, inode->i_size,
- sizeof(SYSV_I(inode)->i_data) - 1);
- }
+ inode->i_op = &sysv_symlink_inode_operations;
+ inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &sysv_aops;
} else
init_special_inode(inode, inode->i_mode, rdev);
}
@@ -353,7 +346,7 @@ int __init sysv_init_icache(void)
{
sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
init_once);
if (!sysv_inode_cachep)
return -ENOMEM;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e83ed0b4bf..90b60c03b588 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -264,11 +264,11 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b94fa6c3c6eb..053818dd6c18 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -153,7 +153,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
if (isalarm(ctx))
remaining = alarm_expires_remaining(&ctx->t.alarm);
else
- remaining = hrtimer_expires_remaining(&ctx->t.tmr);
+ remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr);
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index c66f2423e1f5..4a0e48f92104 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo
* the files within the tracefs system. It is up to the individual
* mkdir routine to handle races.
*/
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
ret = tracefs_ops.mkdir(name);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
kfree(name);
@@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
* This time we need to unlock not only the parent (inode) but
* also the directory that is being deleted.
*/
- mutex_unlock(&inode->i_mutex);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ inode_unlock(inode);
+ inode_unlock(dentry->d_inode);
ret = tracefs_ops.rmdir(name);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock(&dentry->d_inode->i_mutex);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
+ inode_lock(dentry->d_inode);
kfree(name);
@@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = tracefs_mount->mnt_root;
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
dentry = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(dentry) && dentry->d_inode) {
dput(dentry);
@@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
}
if (IS_ERR(dentry)) {
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
@@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
static struct dentry *failed_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ inode_unlock(dentry->d_parent->d_inode);
dput(dentry);
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
return NULL;
@@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
static struct dentry *end_creating(struct dentry *dentry)
{
- mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ inode_unlock(dentry->d_parent->d_inode);
return dentry;
}
@@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry)
if (!parent || !parent->d_inode)
return;
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
ret = __tracefs_remove(dentry, parent);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
if (!ret)
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
}
@@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
parent = dentry;
down:
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
loop:
/*
* The parent->d_subdirs is protected by the d_lock. Outside that
@@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
/* perhaps simple_empty(child) makes more sense */
if (!list_empty(&child->d_subdirs)) {
spin_unlock(&parent->d_lock);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
parent = child;
goto down;
}
@@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry)
}
spin_unlock(&parent->d_lock);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
child = parent;
parent = parent->d_parent;
- mutex_lock(&parent->d_inode->i_mutex);
+ inode_lock(parent->d_inode);
if (child != dentry)
/* go up */
@@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
if (!__tracefs_remove(child, parent))
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- mutex_unlock(&parent->d_inode->i_mutex);
+ inode_unlock(parent->d_inode);
}
/**
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index ba66d508006a..7ff7712f284e 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -35,3 +35,18 @@ config UBIFS_FS_ZLIB
default y
help
Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
+
+config UBIFS_ATIME_SUPPORT
+ bool "Access time support" if UBIFS_FS
+ depends on UBIFS_FS
+ default n
+ help
+ Originally UBIFS did not support atime, because it looked like a bad idea due to
+ increased flash wear. This option adds atime support and it is disabled by default
+ to preserve the old behavior. If you enable this option, UBIFS starts updating atime,
+ which means that file-system read operations will cause writes (inode atime
+ updates). This may affect file-system performance and increase flash device wear,
+ so be careful. How often atime is updated depends on the selected strategy:
+ strictatime is the "heavy", relatime is "lighter", etc.
+
+ If unsure, say 'N'
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
index 2c6f0cb816b4..c54a24360f85 100644
--- a/fs/ubifs/Makefile
+++ b/fs/ubifs/Makefile
@@ -4,3 +4,4 @@ ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
+ubifs-y += misc.o
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4c46a9865fa7..595ca0debe11 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2573,7 +2573,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
{
int err, failing;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
failing = power_cut_emulated(c, lnum, 1);
@@ -2595,7 +2595,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 1))
return -EROFS;
@@ -2611,7 +2611,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
@@ -2627,7 +2627,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c27c66c224a..795992a8321e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -449,13 +449,14 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
out:
+ kfree(file->private_data);
+ file->private_data = NULL;
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
- kfree(file->private_data);
- file->private_data = NULL;
/* 2 is a special value indicating that there are no more direntries */
ctx->pos = 2;
return 0;
@@ -514,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
- ubifs_assert(mutex_is_locked(&dir->i_mutex));
- ubifs_assert(mutex_is_locked(&inode->i_mutex));
+ ubifs_assert(inode_is_locked(dir));
+ ubifs_assert(inode_is_locked(inode));
err = dbg_check_synced_i_size(c, inode);
if (err)
@@ -571,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
dentry, inode->i_ino,
inode->i_nlink, dir->i_ino);
- ubifs_assert(mutex_is_locked(&dir->i_mutex));
- ubifs_assert(mutex_is_locked(&inode->i_mutex));
+ ubifs_assert(inode_is_locked(dir));
+ ubifs_assert(inode_is_locked(inode));
err = dbg_check_synced_i_size(c, inode);
if (err)
return err;
@@ -660,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry,
inode->i_ino, dir->i_ino);
- ubifs_assert(mutex_is_locked(&dir->i_mutex));
- ubifs_assert(mutex_is_locked(&inode->i_mutex));
+ ubifs_assert(inode_is_locked(dir));
+ ubifs_assert(inode_is_locked(inode));
err = check_dir_empty(c, d_inode(dentry));
if (err)
return err;
@@ -787,9 +788,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
if (S_ISBLK(mode) || S_ISCHR(mode)) {
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev)
@@ -998,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu",
old_dentry, old_inode->i_ino, old_dir->i_ino,
new_dentry, new_dir->i_ino);
- ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
- ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+ ubifs_assert(inode_is_locked(old_dir));
+ ubifs_assert(inode_is_locked(new_dir));
if (unlink)
- ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+ ubifs_assert(inode_is_locked(new_inode));
if (unlink && is_dir) {
@@ -1188,6 +1186,9 @@ const struct inode_operations ubifs_dir_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a3dfe2ae79f2..446753d8ac34 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -121,7 +121,7 @@ static int do_readpage(struct page *page)
if (block >= beyond) {
/* Reading beyond inode */
SetPageChecked(page);
- memset(addr, 0, PAGE_CACHE_SIZE);
+ memset(addr, 0, PAGE_SIZE);
goto out;
}
@@ -223,7 +223,7 @@ static int write_begin_slow(struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct ubifs_budget_req req = { .new_page = 1 };
int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
struct page *page;
@@ -254,13 +254,13 @@ static int write_begin_slow(struct address_space *mapping,
}
if (!PageUptodate(page)) {
- if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+ if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE)
SetPageChecked(page);
else {
err = do_readpage(page);
if (err) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
ubifs_release_budget(c, &req);
return err;
}
@@ -428,7 +428,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
int skipped_read = 0;
struct page *page;
@@ -446,7 +446,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
if (!PageUptodate(page)) {
/* The page is not loaded from the flash */
- if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
+ if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) {
/*
* We change whole page so no need to load it. But we
* do not know whether this page exists on the media or
@@ -462,7 +462,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
err = do_readpage(page);
if (err) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return err;
}
}
@@ -494,7 +494,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
mutex_unlock(&ui->ui_mutex);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return write_begin_slow(mapping, pos, len, pagep, flags);
}
@@ -549,12 +549,12 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
inode->i_ino, pos, page->index, len, copied, inode->i_size);
- if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
+ if (unlikely(copied < len && len == PAGE_SIZE)) {
/*
* VFS copied less data to the page that it intended and
* declared in its '->write_begin()' call via the @len
* argument. If the page was not up-to-date, and @len was
- * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did
+ * @PAGE_SIZE, the 'ubifs_write_begin()' function did
* not load it from the media (for optimization reasons). This
* means that part of the page contains garbage. So read the
* page now.
@@ -593,7 +593,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -621,10 +621,10 @@ static int populate_page(struct ubifs_info *c, struct page *page,
addr = zaddr = kmap(page);
- end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (i_size - 1) >> PAGE_SHIFT;
if (!i_size || page->index > end_index) {
hole = 1;
- memset(addr, 0, PAGE_CACHE_SIZE);
+ memset(addr, 0, PAGE_SIZE);
goto out_hole;
}
@@ -673,7 +673,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
}
if (end_index == page->index) {
- int len = i_size & (PAGE_CACHE_SIZE - 1);
+ int len = i_size & (PAGE_SIZE - 1);
if (len && len < read)
memset(zaddr + len, 0, read - len);
@@ -773,7 +773,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
isize = i_size_read(inode);
if (isize == 0)
goto out_free;
- end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ end_index = ((isize - 1) >> PAGE_SHIFT);
for (page_idx = 1; page_idx < page_cnt; page_idx++) {
pgoff_t page_offset = offset + page_idx;
@@ -788,7 +788,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
if (!PageUptodate(page))
err = populate_page(c, page, bu, &n);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if (err)
break;
}
@@ -905,7 +905,7 @@ static int do_writepage(struct page *page, int len)
#ifdef UBIFS_DEBUG
struct ubifs_inode *ui = ubifs_inode(inode);
spin_lock(&ui->ui_lock);
- ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
+ ubifs_assert(page->index <= ui->synced_i_size >> PAGE_SHIFT);
spin_unlock(&ui->ui_lock);
#endif
@@ -1001,8 +1001,8 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
struct inode *inode = page->mapping->host;
struct ubifs_inode *ui = ubifs_inode(inode);
loff_t i_size = i_size_read(inode), synced_i_size;
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- int err, len = i_size & (PAGE_CACHE_SIZE - 1);
+ pgoff_t end_index = i_size >> PAGE_SHIFT;
+ int err, len = i_size & (PAGE_SIZE - 1);
void *kaddr;
dbg_gen("ino %lu, pg %lu, pg flags %#lx",
@@ -1021,7 +1021,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
/* Is the page fully inside @i_size? */
if (page->index < end_index) {
- if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
+ if (page->index >= synced_i_size >> PAGE_SHIFT) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
goto out_unlock;
@@ -1034,7 +1034,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
* with this.
*/
}
- return do_writepage(page, PAGE_CACHE_SIZE);
+ return do_writepage(page, PAGE_SIZE);
}
/*
@@ -1045,7 +1045,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
* writes to that region are not written out to the file."
*/
kaddr = kmap_atomic(page);
- memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
+ memset(kaddr + len, 0, PAGE_SIZE - len);
flush_dcache_page(page);
kunmap_atomic(kaddr);
@@ -1138,7 +1138,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
truncate_setsize(inode, new_size);
if (offset) {
- pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
+ pgoff_t index = new_size >> PAGE_SHIFT;
struct page *page;
page = find_lock_page(inode->i_mapping, index);
@@ -1157,9 +1157,9 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
clear_page_dirty_for_io(page);
if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
offset = new_size &
- (PAGE_CACHE_SIZE - 1);
+ (PAGE_SIZE - 1);
err = do_writepage(page, offset);
- page_cache_release(page);
+ put_page(page);
if (err)
goto out_budg;
/*
@@ -1173,7 +1173,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
* having to read it.
*/
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
}
@@ -1285,7 +1285,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
struct ubifs_info *c = inode->i_sb->s_fs_info;
ubifs_assert(PagePrivate(page));
- if (offset || length < PAGE_CACHE_SIZE)
+ if (offset || length < PAGE_SIZE)
/* Partial page remains dirty */
return;
@@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
err = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (err)
return err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* Synchronize the inode unless this is a 'datasync()' call. */
if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
@@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
err = ubifs_sync_wbufs_by_inode(c, inode);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err;
}
@@ -1354,6 +1354,47 @@ static inline int mctime_update_needed(const struct inode *inode,
return 0;
}
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+/**
+ * ubifs_update_time - update time of inode.
+ * @inode: inode to update
+ * @time: timestamp to store in the selected time fields
+ * @flags: mask of S_ATIME, S_CTIME and S_MTIME selecting which of @inode's
+ *         time fields are set to @time
+ *
+ * This function updates time of the inode. Budget for one dirtied inode is
+ * requested first; if that fails, the error is returned and nothing is
+ * changed. The selected time fields are then written and the inode is marked
+ * dirty while holding ui->ui_mutex, using I_DIRTY_TIME plus I_DIRTY_SYNC
+ * unless the superblock has MS_LAZYTIME set. If ui->dirty was already set
+ * before the inode was marked dirty here, the budget requested by this call
+ * is released again. Returns zero on success.
+ */
+int ubifs_update_time(struct inode *inode, struct timespec *time,
+ int flags)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+ int iflags = I_DIRTY_TIME;
+ int err, release;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ mutex_lock(&ui->ui_mutex);
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+
+ if (!(inode->i_sb->s_flags & MS_LAZYTIME))
+ iflags |= I_DIRTY_SYNC;
+
+ release = ui->dirty;
+ __mark_inode_dirty(inode, iflags);
+ mutex_unlock(&ui->ui_mutex);
+ if (release)
+ ubifs_release_budget(c, &req);
+ return 0;
+}
+#endif
+
/**
* update_ctime - update mtime and ctime of an inode.
* @inode: inode to update
@@ -1537,6 +1578,9 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (err)
return err;
vma->vm_ops = &ubifs_file_vm_ops;
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ file_accessed(file);
+#endif
return 0;
}
@@ -1557,17 +1601,23 @@ const struct inode_operations ubifs_file_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct inode_operations ubifs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
.setxattr = ubifs_setxattr,
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 92a8491a8f8c..c0a95e393347 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -34,6 +34,12 @@
* node. We use "r5" hash borrowed from reiserfs.
*/
+/*
+ * Lots of the key helpers take a struct ubifs_info *c as the first parameter,
+ * but it is not used at all currently. It is there for future extensions that use a
+ * different c->key_format. Right now there is only one key type, UBIFS_SIMPLE_KEY_FMT.
+ */
+
#ifndef __UBIFS_KEY_H__
#define __UBIFS_KEY_H__
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index dc9f27e9d61b..9a517109da0f 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1498,11 +1498,10 @@ static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
}
/* nnode is being committed, so copy it */
- n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+ n = kmemdup(nnode, sizeof(struct ubifs_nnode), GFP_NOFS);
if (unlikely(!n))
return ERR_PTR(-ENOMEM);
- memcpy(n, nnode, sizeof(struct ubifs_nnode));
n->cnext = NULL;
__set_bit(DIRTY_CNODE, &n->flags);
__clear_bit(COW_CNODE, &n->flags);
@@ -1549,11 +1548,10 @@ static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
}
/* pnode is being committed, so copy it */
- p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+ p = kmemdup(pnode, sizeof(struct ubifs_pnode), GFP_NOFS);
if (unlikely(!p))
return ERR_PTR(-ENOMEM);
- memcpy(p, pnode, sizeof(struct ubifs_pnode));
p->cnext = NULL;
__set_bit(DIRTY_CNODE, &p->flags);
__clear_bit(COW_CNODE, &p->flags);
diff --git a/fs/ubifs/misc.c b/fs/ubifs/misc.c
new file mode 100644
index 000000000000..486a2844949f
--- /dev/null
+++ b/fs/ubifs/misc.c
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include "ubifs.h"
+
+/*
+ * Normal UBIFS messages.
+ *
+ * Formats @fmt with the trailing arguments and prints the result through
+ * pr_notice(), prefixed with the UBI device number and volume ID read from
+ * @c->vi.
+ */
+void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ /* %pV below expands the format/va_list pair carried in vaf */
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_notice("UBIFS (ubi%d:%d): %pV\n",
+ c->vi.ubi_num, c->vi.vol_id, &vaf);
+
+ va_end(args);
+}
+
+/*
+ * UBIFS error messages.
+ *
+ * Formats @fmt with the trailing arguments and prints the result through
+ * pr_err(), prefixed with the UBI device number and volume ID read from
+ * @c->vi, the PID of the current task and the symbol of the caller
+ * (__builtin_return_address(0) printed with %ps).
+ */
+void ubifs_err(const struct ubifs_info *c, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+
+ /* %pV below expands the format/va_list pair carried in vaf */
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_err("UBIFS error (ubi%d:%d pid %d): %ps: %pV\n",
+ c->vi.ubi_num, c->vi.vol_id, current->pid,
+ __builtin_return_address(0),
+ &vaf);
+
+ va_end(args);
+}
+
+/*
+ * UBIFS warning messages.
+ *
+ * Same shape as the other UBIFS message helpers: the caller's format and
+ * arguments are wrapped in a struct va_format and emitted through pr_warn()
+ * together with the UBI device/volume IDs from @c->vi, the current PID and
+ * the caller's symbol.
+ */
+void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...)
+{
+ va_list ap;
+ struct va_format vaf = { .fmt = fmt, .va = &ap };
+
+ va_start(ap, fmt);
+ pr_warn("UBIFS warning (ubi%d:%d pid %d): %ps: %pV\n",
+ c->vi.ubi_num, c->vi.vol_id, current->pid,
+ __builtin_return_address(0), &vaf);
+ va_end(ap);
+}
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index ee7cb5ebb6e8..8ece6ca58c0b 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -155,13 +155,8 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
*/
static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
{
- if (new_valid_dev(rdev)) {
- dev->new = cpu_to_le32(new_encode_dev(rdev));
- return sizeof(dev->new);
- } else {
- dev->huge = cpu_to_le64(huge_encode_dev(rdev));
- return sizeof(dev->huge);
- }
+ dev->new = cpu_to_le32(new_encode_dev(rdev));
+ return sizeof(dev->new);
}
/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 695fc71d5244..586d59347fff 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -789,7 +789,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
corrupted_rescan:
/* Re-scan the corrupted data with verbose messages */
ubifs_err(c, "corruption %d", ret);
- ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
err = -EUCLEAN;
@@ -1331,8 +1331,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c)
struct size_entry *e, *n;
rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
kfree(e);
}
@@ -1533,8 +1532,7 @@ int ubifs_recover_size(struct ubifs_info *c)
err = fix_size_in_place(c, e);
if (err)
return err;
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
}
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9547a27868ad..e98c24ee25a1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -128,7 +128,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
if (err)
goto out_ino;
- inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+ inode->i_flags |= S_NOCMTIME;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ inode->i_flags |= S_NOATIME;
+#endif
set_nlink(inode, le32_to_cpu(ino->nlink));
i_uid_write(inode, le32_to_cpu(ino->uid));
i_gid_write(inode, le32_to_cpu(ino->gid));
@@ -2037,7 +2040,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (c->max_inode_sz > MAX_LFS_FILESIZE)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
- sb->s_xattr = ubifs_xattr_handlers;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
@@ -2139,7 +2141,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
if (err)
goto out_deact;
/* We do not support atime */
- sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+ sb->s_flags |= MS_ACTIVE;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ sb->s_flags |= MS_NOATIME;
+#else
+ ubifs_msg(c, "full atime support is enabled.");
+#endif
}
/* 'fill_super()' opens ubi again so we must close it here */
@@ -2230,19 +2237,19 @@ static int __init ubifs_init(void)
BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
/*
- * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
+ * We require that PAGE_SIZE is greater-than-or-equal-to
* UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
*/
- if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
+ if (PAGE_SIZE < UBIFS_BLOCK_SIZE) {
pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
- current->pid, (unsigned int)PAGE_CACHE_SIZE);
+ current->pid, (unsigned int)PAGE_SIZE);
return -EINVAL;
}
ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
sizeof(struct ubifs_inode), 0,
- SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
- &inode_slab_ctor);
+ SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT, &inode_slab_ctor);
if (!ubifs_inode_slab)
return -ENOMEM;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 957f5757f374..fa9a20cc60d6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -198,11 +198,10 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
{
struct ubifs_znode *zn;
- zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+ zn = kmemdup(znode, c->max_znode_sz, GFP_NOFS);
if (unlikely(!zn))
return ERR_PTR(-ENOMEM);
- memcpy(zn, znode, c->max_znode_sz);
zn->cnext = NULL;
__set_bit(DIRTY_ZNODE, &zn->flags);
__clear_bit(COW_ZNODE, &zn->flags);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index de759022f3d6..4cd7e569cd00 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -42,36 +42,12 @@
/* Version of this UBIFS implementation */
#define UBIFS_VERSION 1
-/* Normal UBIFS messages */
-#define ubifs_msg(c, fmt, ...) \
- pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \
- (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__)
-/* UBIFS error messages */
-#define ubifs_err(c, fmt, ...) \
- pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \
- (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
- __func__, ##__VA_ARGS__)
-/* UBIFS warning messages */
-#define ubifs_warn(c, fmt, ...) \
- pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \
- (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \
- __func__, ##__VA_ARGS__)
-/*
- * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
- * object as an argument.
- */
-#define ubifs_errc(c, fmt, ...) \
- do { \
- if (!(c)->probing) \
- ubifs_err(c, fmt, ##__VA_ARGS__); \
- } while (0)
-
/* UBIFS file system VFS magic number */
#define UBIFS_SUPER_MAGIC 0x24051905
/* Number of UBIFS blocks per VFS page */
-#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
-#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
+#define UBIFS_BLOCKS_PER_PAGE (PAGE_SIZE / UBIFS_BLOCK_SIZE)
+#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_SHIFT - UBIFS_BLOCK_SHIFT)
/* "File system end of life" sequence number watermark */
#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
@@ -858,9 +834,9 @@ struct ubifs_compressor {
* @mod_dent: non-zero if the operation removes or modifies an existing
* directory entry
* @new_ino: non-zero if the operation adds a new inode
- * @new_ino_d: now much data newly created inode contains
+ * @new_ino_d: how much data newly created inode contains
* @dirtied_ino: how many inodes the operation makes dirty
- * @dirtied_ino_d: now much data dirtied inode contains
+ * @dirtied_ino_d: how much data dirtied inode contains
* @idx_growth: how much the index will supposedly grow
* @data_growth: how much new data the operation will supposedly add
* @dd_growth: how much data that makes other data dirty the operation will
@@ -1470,7 +1446,6 @@ extern spinlock_t ubifs_infos_lock;
extern atomic_long_t ubifs_clean_zn_cnt;
extern struct kmem_cache *ubifs_inode_slab;
extern const struct super_operations ubifs_super_operations;
-extern const struct xattr_handler *ubifs_xattr_handlers[];
extern const struct address_space_operations ubifs_file_address_operations;
extern const struct file_operations ubifs_file_operations;
extern const struct inode_operations ubifs_file_inode_operations;
@@ -1746,6 +1721,9 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
+#endif
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
@@ -1800,4 +1778,21 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
#include "misc.h"
#include "key.h"
+/* UBIFS notice/error/warning printers; __printf() format-checks callers */
+__printf(2, 3)
+void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
+__printf(2, 3)
+void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
+__printf(2, 3)
+void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
+/*
+ * A variant of 'ubifs_err()' which prints nothing while the UBIFS file-system
+ * description object @c has 'probing' set; otherwise it calls 'ubifs_err()'.
+ */
+#define ubifs_errc(c, fmt, ...) \
+do { \
+ if (!(c)->probing) \
+ ubifs_err(c, fmt, ##__VA_ARGS__); \
+} while (0)
+
#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index fd65b3f1923c..b043e044121d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -59,7 +59,6 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
-#include <linux/posix_acl_xattr.h>
/*
* Limit the number of extended attributes per inode so that the total size
@@ -200,6 +199,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
int err;
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_inode *ui = ubifs_inode(inode);
+ void *buf = NULL;
struct ubifs_budget_req req = { .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
@@ -208,14 +208,17 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
if (err)
return err;
- kfree(ui->data);
- ui->data = kmemdup(value, size, GFP_NOFS);
- if (!ui->data) {
+ buf = kmemdup(value, size, GFP_NOFS);
+ if (!buf) {
err = -ENOMEM;
goto out_free;
}
+ mutex_lock(&ui->ui_mutex);
+ kfree(ui->data);
+ ui->data = buf;
inode->i_size = ui->ui_size = size;
ui->data_len = size;
+ mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
@@ -263,7 +266,7 @@ static int check_namespace(const struct qstr *nm)
if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN)) {
- if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
+ if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0')
return -EINVAL;
type = TRUSTED_XATTR;
} else if (!strncmp(nm->name, XATTR_USER_PREFIX,
@@ -273,7 +276,7 @@ static int check_namespace(const struct qstr *nm)
type = USER_XATTR;
} else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN)) {
- if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
+ if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0')
return -EINVAL;
type = SECURITY_XATTR;
} else
@@ -309,7 +312,7 @@ static int setxattr(struct inode *host, const char *name, const void *value,
union ubifs_key key;
int err, type;
- ubifs_assert(mutex_is_locked(&host->i_mutex));
+ ubifs_assert(inode_is_locked(host));
if (size > UBIFS_MAX_INO_DATA)
return -ERANGE;
@@ -409,6 +412,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
ubifs_assert(inode->i_size == ui->data_len);
ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
+ mutex_lock(&ui->ui_mutex);
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
@@ -423,6 +427,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
err = ui->data_len;
out_iput:
+ mutex_unlock(&ui->ui_mutex);
iput(inode);
out_unlock:
kfree(xent);
@@ -544,7 +549,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
dbg_gen("xattr '%s', ino %lu ('%pd')", name,
host->i_ino, dentry);
- ubifs_assert(mutex_is_locked(&host->i_mutex));
+ ubifs_assert(inode_is_locked(host));
err = check_namespace(&nm);
if (err < 0)
@@ -582,46 +587,6 @@ out_free:
return err;
}
-static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int flags)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
-
- return total_len;
-}
-
-static int security_getxattr(struct dentry *d, const char *name, void *buffer,
- size_t size, int flags)
-{
- return ubifs_getxattr(d, name, buffer, size);
-}
-
-static int security_setxattr(struct dentry *d, const char *name,
- const void *value, size_t size, int flags,
- int handler_flags)
-{
- return ubifs_setxattr(d, name, value, size, flags);
-}
-
-static const struct xattr_handler ubifs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = security_listxattr,
- .get = security_getxattr,
- .set = security_setxattr,
-};
-
-const struct xattr_handler *ubifs_xattr_handlers[] = {
- &ubifs_xattr_security_handler,
- NULL,
-};
-
static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 6d6a96b4e73f..e0fd65fe73e8 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb,
*/
int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
- struct allocExtDesc *aed;
eloc.logicalBlockNum = start;
elen = EXT_RECORDED_ALLOCATED |
@@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb,
}
if (epos.offset + (2 * adsize) > sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- int loffset;
-
- brelse(oepos.bh);
- oepos = epos;
-
/* Steal a block from the extent being free'd */
- epos.block.logicalBlockNum = eloc.logicalBlockNum;
+ udf_setup_indirect_aext(table, eloc.logicalBlockNum,
+ &epos);
+
eloc.logicalBlockNum++;
elen -= sb->s_blocksize;
-
- epos.bh = udf_tread(sb,
- udf_get_lb_pblock(sb, &epos.block, 0));
- if (!epos.bh) {
- brelse(oepos.bh);
- goto error_return;
- }
- aed = (struct allocExtDesc *)(epos.bh->b_data);
- aed->previousAllocExtLocation =
- cpu_to_le32(oepos.block.logicalBlockNum);
- if (epos.offset + adsize > sb->s_blocksize) {
- loffset = epos.offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = iinfo->i_ext.i_data + epos.offset
- - adsize;
- dptr = epos.bh->b_data +
- sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos.offset = sizeof(struct allocExtDesc) +
- adsize;
- } else {
- loffset = epos.offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- if (oepos.bh) {
- sptr = oepos.bh->b_data + epos.offset;
- aed = (struct allocExtDesc *)
- oepos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs,
- adsize);
- } else {
- sptr = iinfo->i_ext.i_data +
- epos.offset;
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- }
- epos.offset = sizeof(struct allocExtDesc);
- }
- if (sbi->s_udfrev >= 0x0200)
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 3, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
- else
- udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
- 2, 1, epos.block.logicalBlockNum,
- sizeof(struct tag));
-
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos.block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(
- EXT_NEXT_EXTENT_ALLOCDECS |
- sb->s_blocksize);
- lad->extLocation =
- cpu_to_lelb(epos.block);
- break;
- }
- if (oepos.bh) {
- udf_update_tag(oepos.bh->b_data, loffset);
- mark_buffer_dirty(oepos.bh);
- } else {
- mark_inode_dirty(table);
- }
}
/* It's possible that stealing the block emptied the extent */
- if (elen) {
- udf_write_aext(table, &epos, &eloc, elen, 1);
-
- if (!epos.bh) {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(table);
- } else {
- aed = (struct allocExtDesc *)epos.bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- udf_update_tag(epos.bh->b_data, epos.offset);
- mark_buffer_dirty(epos.bh);
- }
- }
+ if (elen)
+ __udf_add_aext(table, &epos, &eloc, elen, 1);
}
brelse(epos.bh);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 541d9c65014d..b51b371b874a 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -45,7 +45,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
int block, iblock;
loff_t nf_pos;
int flen;
- unsigned char *fname = NULL;
+ unsigned char *fname = NULL, *copy_name = NULL;
unsigned char *nameptr;
uint16_t liu;
uint8_t lfi;
@@ -143,7 +143,15 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
if (poffset >= lfi) {
nameptr = (char *)(fibh.ebh->b_data + poffset - lfi);
} else {
- nameptr = fname;
+ if (!copy_name) {
+ copy_name = kmalloc(UDF_NAME_LEN,
+ GFP_NOFS);
+ if (!copy_name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ nameptr = copy_name;
memcpy(nameptr, fi->fileIdent + liu,
lfi - poffset);
memcpy(nameptr + lfi - poffset,
@@ -185,6 +193,7 @@ out:
brelse(fibh.sbh);
brelse(epos.bh);
kfree(fname);
+ kfree(copy_name);
return ret;
}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bddf3d071dae..877ba1c9b461 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -46,7 +46,7 @@ static void __udf_adinicb_readpage(struct page *page)
kaddr = kmap(page);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
- memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
+ memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size);
flush_dcache_page(page);
SetPageUptodate(page);
kunmap(page);
@@ -87,14 +87,14 @@ static int udf_adinicb_write_begin(struct file *file,
{
struct page *page;
- if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
+ if (WARN_ON_ONCE(pos >= PAGE_SIZE))
return -EIO;
page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page)
return -ENOMEM;
*pagep = page;
- if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
+ if (!PageUptodate(page) && len != PAGE_SIZE)
__udf_adinicb_readpage(page);
return 0;
}
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct udf_inode_info *iinfo = UDF_I(inode);
int err;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
retval = generic_write_checks(iocb, from);
if (retval <= 0)
@@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
(udf_file_entry_alloc_offset(inode) + end)) {
err = udf_expand_file_adinicb(inode);
if (err) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
udf_debug("udf_expand_adinicb: err=%d\n", err);
return err;
}
@@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
retval = __generic_file_write_iter(iocb, from);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (retval > 0) {
mark_inode_dirty(inode);
@@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp)
* Grab i_mutex to avoid races with writes changing i_size
* while we are running.
*/
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
down_write(&UDF_I(inode)->i_data_sem);
udf_discard_prealloc(inode);
udf_truncate_tail_extent(inode);
up_write(&UDF_I(inode)->i_data_sem);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
return 0;
}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8d0b3ade0ff0..2dc461eeb415 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode)
.nr_to_write = 1,
};
- WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+ WARN_ON_ONCE(!inode_is_locked(inode));
if (!iinfo->i_lenAlloc) {
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
@@ -287,7 +287,7 @@ int udf_expand_file_adinicb(struct inode *inode)
if (!PageUptodate(page)) {
kaddr = kmap(page);
memset(kaddr + iinfo->i_lenAlloc, 0x00,
- PAGE_CACHE_SIZE - iinfo->i_lenAlloc);
+ PAGE_SIZE - iinfo->i_lenAlloc);
memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
iinfo->i_lenAlloc);
flush_dcache_page(page);
@@ -319,7 +319,7 @@ int udf_expand_file_adinicb(struct inode *inode)
inode->i_data.a_ops = &udf_adinicb_aops;
up_write(&iinfo->i_data_sem);
}
- page_cache_release(page);
+ put_page(page);
mark_inode_dirty(inode);
return err;
@@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode,
udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
count++;
- } else
+ } else {
+ struct kernel_lb_addr tmploc;
+ uint32_t tmplen;
+
udf_write_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
+ /*
+ * We've rewritten the last extent but there may be empty
+ * indirect extent after it - enter it.
+ */
+ udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
+ }
/* Managed to do everything necessary? */
if (!blocks)
@@ -1540,7 +1549,8 @@ reread:
break;
case ICBTAG_FILE_TYPE_SYMLINK:
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
break;
case ICBTAG_FILE_TYPE_MAIN:
@@ -1866,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
return inode;
}
-int udf_add_aext(struct inode *inode, struct extent_position *epos,
- struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos)
{
- int adsize;
- struct short_ad *sad = NULL;
- struct long_ad *lad = NULL;
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh;
struct allocExtDesc *aed;
- uint8_t *ptr;
- struct udf_inode_info *iinfo = UDF_I(inode);
+ struct extent_position nepos;
+ struct kernel_lb_addr neloc;
+ int ver, adsize;
- if (!epos->bh)
- ptr = iinfo->i_ext.i_data + epos->offset -
- udf_file_entry_alloc_offset(inode) +
- iinfo->i_lenEAttr;
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
else
- ptr = epos->bh->b_data + epos->offset;
+ return -EIO;
+
+ neloc.logicalBlockNum = block;
+ neloc.partitionReferenceNum = epos->block.partitionReferenceNum;
+
+ bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0));
+ if (!bh)
+ return -EIO;
+ lock_buffer(bh);
+ memset(bh->b_data, 0x00, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ mark_buffer_dirty_inode(bh, inode);
+
+ aed = (struct allocExtDesc *)(bh->b_data);
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) {
+ aed->previousAllocExtLocation =
+ cpu_to_le32(epos->block.logicalBlockNum);
+ }
+ aed->lengthAllocDescs = cpu_to_le32(0);
+ if (UDF_SB(sb)->s_udfrev >= 0x0200)
+ ver = 3;
+ else
+ ver = 2;
+ udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block,
+ sizeof(struct tag));
+
+ nepos.block = neloc;
+ nepos.offset = sizeof(struct allocExtDesc);
+ nepos.bh = bh;
+
+ /*
+ * Do we have to copy current last extent to make space for indirect
+ * one?
+ */
+ if (epos->offset + adsize > sb->s_blocksize) {
+ struct kernel_lb_addr cp_loc;
+ uint32_t cp_len;
+ int cp_type;
+
+ epos->offset -= adsize;
+ cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0);
+ cp_len |= ((uint32_t)cp_type) << 30;
+
+ __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1);
+ udf_write_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ } else {
+ __udf_add_aext(inode, epos, &nepos.block,
+ sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+ }
+
+ brelse(epos->bh);
+ *epos = nepos;
+
+ return 0;
+}
+
+/*
+ * Append extent at the given position - should be the first free one in inode
+ * / indirect extent. This function assumes there is enough space in the inode
+ * or indirect extent. Use udf_add_aext() if you didn't check for this before.
+ */
+int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ struct udf_inode_info *iinfo = UDF_I(inode);
+ struct allocExtDesc *aed;
+ int adsize;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -1890,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
else
return -EIO;
- if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
- unsigned char *sptr, *dptr;
- struct buffer_head *nbh;
- int err, loffset;
- struct kernel_lb_addr obloc = epos->block;
-
- epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
- obloc.partitionReferenceNum,
- obloc.logicalBlockNum, &err);
- if (!epos->block.logicalBlockNum)
- return -ENOSPC;
- nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
- &epos->block,
- 0));
- if (!nbh)
- return -EIO;
- lock_buffer(nbh);
- memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
- set_buffer_uptodate(nbh);
- unlock_buffer(nbh);
- mark_buffer_dirty_inode(nbh, inode);
-
- aed = (struct allocExtDesc *)(nbh->b_data);
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
- aed->previousAllocExtLocation =
- cpu_to_le32(obloc.logicalBlockNum);
- if (epos->offset + adsize > inode->i_sb->s_blocksize) {
- loffset = epos->offset;
- aed->lengthAllocDescs = cpu_to_le32(adsize);
- sptr = ptr - adsize;
- dptr = nbh->b_data + sizeof(struct allocExtDesc);
- memcpy(dptr, sptr, adsize);
- epos->offset = sizeof(struct allocExtDesc) + adsize;
- } else {
- loffset = epos->offset + adsize;
- aed->lengthAllocDescs = cpu_to_le32(0);
- sptr = ptr;
- epos->offset = sizeof(struct allocExtDesc);
-
- if (epos->bh) {
- aed = (struct allocExtDesc *)epos->bh->b_data;
- le32_add_cpu(&aed->lengthAllocDescs, adsize);
- } else {
- iinfo->i_lenAlloc += adsize;
- mark_inode_dirty(inode);
- }
- }
- if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- else
- udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
- epos->block.logicalBlockNum, sizeof(struct tag));
- switch (iinfo->i_alloc_type) {
- case ICBTAG_FLAG_AD_SHORT:
- sad = (struct short_ad *)sptr;
- sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- sad->extPosition =
- cpu_to_le32(epos->block.logicalBlockNum);
- break;
- case ICBTAG_FLAG_AD_LONG:
- lad = (struct long_ad *)sptr;
- lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
- inode->i_sb->s_blocksize);
- lad->extLocation = cpu_to_lelb(epos->block);
- memset(lad->impUse, 0x00, sizeof(lad->impUse));
- break;
- }
- if (epos->bh) {
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos->bh->b_data, loffset);
- else
- udf_update_tag(epos->bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos->bh, inode);
- brelse(epos->bh);
- } else {
- mark_inode_dirty(inode);
- }
- epos->bh = nbh;
+ if (!epos->bh) {
+ WARN_ON(iinfo->i_lenAlloc !=
+ epos->offset - udf_file_entry_alloc_offset(inode));
+ } else {
+ aed = (struct allocExtDesc *)epos->bh->b_data;
+ WARN_ON(le32_to_cpu(aed->lengthAllocDescs) !=
+ epos->offset - sizeof(struct allocExtDesc));
+ WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize);
}
udf_write_aext(inode, epos, eloc, elen, inc);
@@ -1995,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
return 0;
}
+/*
+ * Append extent at @epos (first free slot in inode / indirect extent), first
+ * allocating and linking an indirect block if needed. Returns 0 or -errno.
+ */
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+ int adsize;
+ struct super_block *sb = inode->i_sb;
+
+ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+ adsize = sizeof(struct short_ad);
+ else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+ adsize = sizeof(struct long_ad);
+ else
+ return -EIO;
+ /* Fewer than two descriptors fit here: chain in a new indirect block */
+ if (epos->offset + (2 * adsize) > sb->s_blocksize) {
+ int err;
+ int new_block;
+
+ new_block = udf_new_block(sb, NULL,
+ epos->block.partitionReferenceNum,
+ epos->block.logicalBlockNum, &err);
+ if (!new_block)
+ return -ENOSPC;
+
+ err = udf_setup_indirect_aext(inode, new_block, epos);
+ if (err)
+ return err;
+ }
+
+ return __udf_add_aext(inode, epos, eloc, elen, inc);
+}
+
void udf_write_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t elen, int inc)
{
@@ -2047,14 +2086,29 @@ void udf_write_aext(struct inode *inode, struct extent_position *epos,
epos->offset += adsize;
}
+/*
+ * Only 1 indirect extent in a row really makes sense but allow up to 16 in
+ * case someone does some weird stuff.
+ */
+#define UDF_MAX_INDIR_EXTS 16
+
int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
struct kernel_lb_addr *eloc, uint32_t *elen, int inc)
{
int8_t etype;
+ unsigned int indirections = 0;
while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) ==
(EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
int block;
+
+ if (++indirections > UDF_MAX_INDIR_EXTS) {
+ udf_err(inode->i_sb,
+ "too many indirect extents in inode %lu\n",
+ inode->i_ino);
+ return -1;
+ }
+
epos->block = *eloc;
epos->offset = sizeof(struct allocExtDesc);
brelse(epos->bh);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c97b5a8d1e24..a2ba11eca995 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -165,7 +165,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
struct fileIdentDesc *fi = NULL;
loff_t f_pos;
int block, flen;
- unsigned char *fname = NULL;
+ unsigned char *fname = NULL, *copy_name = NULL;
unsigned char *nameptr;
uint8_t lfi;
uint16_t liu;
@@ -236,7 +236,15 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
nameptr = (uint8_t *)(fibh->ebh->b_data +
poffset - lfi);
else {
- nameptr = fname;
+ if (!copy_name) {
+ copy_name = kmalloc(UDF_NAME_LEN,
+ GFP_NOFS);
+ if (!copy_name) {
+ fi = ERR_PTR(-ENOMEM);
+ goto out_err;
+ }
+ }
+ nameptr = copy_name;
memcpy(nameptr, fi->fileIdent + liu,
lfi - poffset);
memcpy(nameptr + lfi - poffset,
@@ -279,6 +287,7 @@ out_err:
out_ok:
brelse(epos.bh);
kfree(fname);
+ kfree(copy_name);
return fi;
}
@@ -291,7 +300,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
struct udf_fileident_bh fibh;
struct fileIdentDesc *fi;
- if (dentry->d_name.len > UDF_NAME_LEN - 2)
+ if (dentry->d_name.len > UDF_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
#ifdef UDF_RECOVERY
@@ -351,7 +360,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
struct udf_inode_info *dinfo;
fibh->sbh = fibh->ebh = NULL;
- name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
if (!name) {
*err = -ENOMEM;
goto out_err;
@@ -362,8 +371,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
*err = -EINVAL;
goto out_err;
}
- namelen = udf_put_filename(sb, dentry->d_name.name, name,
- dentry->d_name.len);
+ namelen = udf_put_filename(sb, dentry->d_name.name,
+ dentry->d_name.len,
+ name, UDF_NAME_LEN_CS0);
if (!namelen) {
*err = -ENAMETOOLONG;
goto out_err;
@@ -914,14 +924,15 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
iinfo = UDF_I(inode);
down_write(&iinfo->i_data_sem);
- name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+ name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
if (!name) {
err = -ENOMEM;
goto out_no_entry;
}
inode->i_data.a_ops = &udf_symlink_aops;
- inode->i_op = &udf_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
struct kernel_lb_addr eloc;
@@ -996,8 +1007,9 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
}
if (pc->componentType == 5) {
- namelen = udf_put_filename(sb, compstart, name,
- symname - compstart);
+ namelen = udf_put_filename(sb, compstart,
+ symname - compstart,
+ name, UDF_NAME_LEN_CS0);
if (!namelen)
goto out_no_entry;
@@ -1344,8 +1356,3 @@ const struct inode_operations udf_dir_inode_operations = {
.rename = udf_rename,
.tmpfile = udf_tmpfile,
};
-const struct inode_operations udf_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
-};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 81155b9b445b..fa92fe839fda 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -179,7 +179,8 @@ static int __init init_inodecache(void)
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
0, (SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD |
+ SLAB_ACCOUNT),
init_once);
if (!udf_inode_cachep)
return -ENOMEM;
@@ -278,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
{
int i;
int nr_groups = bitmap->s_nr_groups;
- int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
- nr_groups);
for (i = 0; i < nr_groups; i++)
if (bitmap->s_block_bitmap[i])
brelse(bitmap->s_block_bitmap[i]);
- if (size <= PAGE_SIZE)
- kfree(bitmap);
- else
- vfree(bitmap);
+ kvfree(bitmap);
}
static void udf_free_partition(struct udf_part_map *map)
@@ -891,18 +887,14 @@ static int udf_find_fileset(struct super_block *sb,
static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
{
struct primaryVolDesc *pvoldesc;
- struct ustr *instr, *outstr;
+ uint8_t *outstr;
struct buffer_head *bh;
uint16_t ident;
int ret = -ENOMEM;
- instr = kmalloc(sizeof(struct ustr), GFP_NOFS);
- if (!instr)
- return -ENOMEM;
-
- outstr = kmalloc(sizeof(struct ustr), GFP_NOFS);
+ outstr = kmalloc(128, GFP_NOFS);
if (!outstr)
- goto out1;
+ return -ENOMEM;
bh = udf_read_tagged(sb, block, block, &ident);
if (!bh) {
@@ -927,31 +919,25 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
#endif
}
- if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) {
- ret = udf_CS0toUTF8(outstr, instr);
- if (ret < 0)
- goto out_bh;
+ ret = udf_CS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
+ if (ret < 0)
+ goto out_bh;
- strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name,
- outstr->u_len > 31 ? 31 : outstr->u_len);
- udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
- }
+ strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
+ udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
- if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) {
- ret = udf_CS0toUTF8(outstr, instr);
- if (ret < 0)
- goto out_bh;
+ ret = udf_CS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
+ if (ret < 0)
+ goto out_bh;
- udf_debug("volSetIdent[] = '%s'\n", outstr->u_name);
- }
+ outstr[ret] = 0;
+ udf_debug("volSetIdent[] = '%s'\n", outstr);
ret = 0;
out_bh:
brelse(bh);
out2:
kfree(outstr);
-out1:
- kfree(instr);
return ret;
}
@@ -1586,6 +1572,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
}
/*
+ * Maximum number of Terminating Descriptor redirections. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_TD_NESTING 64
+
+/*
* Process a main/reserve volume descriptor sequence.
* @block First block of first extent of the sequence.
* @lastblock Lastblock of first extent of the sequence.
@@ -1609,6 +1602,7 @@ static noinline int udf_process_sequence(
uint16_t ident;
long next_s = 0, next_e = 0;
int ret;
+ unsigned int indirections = 0;
memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
@@ -1679,6 +1673,12 @@ static noinline int udf_process_sequence(
}
break;
case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
+ if (++indirections > UDF_MAX_TD_NESTING) {
+ udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING);
+ brelse(bh);
+ return -EIO;
+ }
+
vds[VDS_POS_TERMINATING_DESC].block = block;
if (next_e) {
block = next_s;
@@ -2348,7 +2348,7 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
le32_to_cpu(lvidiu->numDirs)) : 0)
+ buf->f_bfree;
buf->f_ffree = buf->f_bfree;
- buf->f_namelen = UDF_NAME_LEN - 2;
+ buf->f_namelen = UDF_NAME_LEN;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 862535b3ba58..8d619773056b 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -107,7 +107,7 @@ static int udf_symlink_filler(struct file *file, struct page *page)
struct buffer_head *bh = NULL;
unsigned char *symlink;
int err;
- unsigned char *p = kmap(page);
+ unsigned char *p = page_address(page);
struct udf_inode_info *iinfo;
uint32_t pos;
@@ -141,7 +141,6 @@ static int udf_symlink_filler(struct file *file, struct page *page)
up_read(&iinfo->i_data_sem);
SetPageUptodate(page);
- kunmap(page);
unlock_page(page);
return 0;
@@ -149,7 +148,6 @@ out_unlock_inode:
up_read(&iinfo->i_data_sem);
SetPageError(page);
out_unmap:
- kunmap(page);
unlock_page(page);
return err;
}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 47bb3f5ca360..972b70625614 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -49,8 +49,8 @@ extern __printf(3, 4) void _udf_warn(struct super_block *sb,
#define UDF_EXTENT_FLAG_MASK 0xC0000000
#define UDF_NAME_PAD 4
-#define UDF_NAME_LEN 256
-#define UDF_PATH_LEN 1023
+#define UDF_NAME_LEN 254
+#define UDF_NAME_LEN_CS0 255
static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
{
@@ -85,7 +85,6 @@ extern const struct inode_operations udf_dir_inode_operations;
extern const struct file_operations udf_dir_operations;
extern const struct inode_operations udf_file_inode_operations;
extern const struct file_operations udf_file_operations;
-extern const struct inode_operations udf_symlink_inode_operations;
extern const struct address_space_operations udf_aops;
extern const struct address_space_operations udf_adinicb_aops;
extern const struct address_space_operations udf_symlink_aops;
@@ -107,12 +106,6 @@ struct generic_desc {
__le32 volDescSeqNum;
};
-struct ustr {
- uint8_t u_cmpID;
- uint8_t u_name[UDF_NAME_LEN - 2];
- uint8_t u_len;
-};
-
/* super.c */
@@ -159,6 +152,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
extern long udf_block_map(struct inode *, sector_t);
extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
struct kernel_lb_addr *, uint32_t *, sector_t *);
+extern int udf_setup_indirect_aext(struct inode *inode, int block,
+ struct extent_position *epos);
+extern int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+ struct kernel_lb_addr *eloc, uint32_t elen, int inc);
extern int udf_add_aext(struct inode *, struct extent_position *,
struct kernel_lb_addr *, uint32_t, int);
extern void udf_write_aext(struct inode *, struct extent_position *,
@@ -211,12 +208,11 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
}
/* unicode.c */
-extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *,
- int);
-extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
- int);
-extern int udf_build_ustr(struct ustr *, dstring *, int);
-extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
+extern int udf_get_filename(struct super_block *, const uint8_t *, int,
+ uint8_t *, int);
+extern int udf_put_filename(struct super_block *, const uint8_t *, int,
+ uint8_t *, int);
+extern int udf_CS0toUTF8(uint8_t *, int, const uint8_t *, int);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index ab478e62baae..3ff42f4437f3 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,190 +28,72 @@
#include "udf_sb.h"
-static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
- int);
-
-static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
-{
- if ((!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN - 2))
- return 0;
-
- memset(dest, 0, sizeof(struct ustr));
- memcpy(dest->u_name, src, strlen);
- dest->u_cmpID = 0x08;
- dest->u_len = strlen;
-
- return strlen;
-}
-
-/*
- * udf_build_ustr
- */
-int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
+static int udf_uni2char_utf8(wchar_t uni,
+ unsigned char *out,
+ int boundlen)
{
- int usesize;
-
- if (!dest || !ptr || !size)
- return -1;
- BUG_ON(size < 2);
-
- usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
- usesize = min(usesize, size - 2);
- dest->u_cmpID = ptr[0];
- dest->u_len = usesize;
- memcpy(dest->u_name, ptr + 1, usesize);
- memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
-
- return 0;
-}
-
-/*
- * udf_build_ustr_exact
- */
-static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
-{
- memset(dest, 0, sizeof(struct ustr));
- dest->u_cmpID = ptr[0];
- dest->u_len = exactsize - 1;
- memcpy(dest->u_name, ptr + 1, exactsize - 1);
-}
-
-/*
- * udf_CS0toUTF8
- *
- * PURPOSE
- * Convert OSTA Compressed Unicode to the UTF-8 equivalent.
- *
- * PRE-CONDITIONS
- * utf Pointer to UTF-8 output buffer.
- * ocu Pointer to OSTA Compressed Unicode input buffer
- * of size UDF_NAME_LEN bytes.
- * both of type "struct ustr *"
- *
- * POST-CONDITIONS
- * <return> >= 0 on success.
- *
- * HISTORY
- * November 12, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
-{
- const uint8_t *ocu;
- uint8_t cmp_id, ocu_len;
- int i;
-
- ocu_len = ocu_i->u_len;
- if (ocu_len == 0) {
- memset(utf_o, 0, sizeof(struct ustr));
- return 0;
- }
-
- cmp_id = ocu_i->u_cmpID;
- if (cmp_id != 8 && cmp_id != 16) {
- memset(utf_o, 0, sizeof(struct ustr));
- pr_err("unknown compression code (%d) stri=%s\n",
- cmp_id, ocu_i->u_name);
- return -EINVAL;
- }
-
- ocu = ocu_i->u_name;
- utf_o->u_len = 0;
- for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
-
- /* Expand OSTA compressed Unicode to Unicode */
- uint32_t c = ocu[i++];
- if (cmp_id == 16)
- c = (c << 8) | ocu[i++];
-
- /* Compress Unicode to UTF-8 */
- if (c < 0x80U)
- utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
- else if (c < 0x800U) {
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0xc0 | (c >> 6));
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0x80 | (c & 0x3f));
- } else {
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0xe0 | (c >> 12));
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0x80 |
- ((c >> 6) & 0x3f));
- utf_o->u_name[utf_o->u_len++] =
- (uint8_t)(0x80 | (c & 0x3f));
- }
+ int u_len = 0;
+
+ if (boundlen <= 0)
+ return -ENAMETOOLONG;
+
+ if (uni < 0x80) {
+ out[u_len++] = (unsigned char)uni;
+ } else if (uni < 0x800) {
+ if (boundlen < 2)
+ return -ENAMETOOLONG;
+ out[u_len++] = (unsigned char)(0xc0 | (uni >> 6));
+ out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
+ } else {
+ if (boundlen < 3)
+ return -ENAMETOOLONG;
+ out[u_len++] = (unsigned char)(0xe0 | (uni >> 12));
+ out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
+ out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
}
- utf_o->u_cmpID = 8;
-
- return utf_o->u_len;
+ return u_len;
}
-/*
- *
- * udf_UTF8toCS0
- *
- * PURPOSE
- * Convert UTF-8 to the OSTA Compressed Unicode equivalent.
- *
- * DESCRIPTION
- * This routine is only called by udf_lookup().
- *
- * PRE-CONDITIONS
- * ocu Pointer to OSTA Compressed Unicode output
- * buffer of size UDF_NAME_LEN bytes.
- * utf Pointer to UTF-8 input buffer.
- * utf_len Length of UTF-8 input buffer in bytes.
- *
- * POST-CONDITIONS
- * <return> Zero on success.
- *
- * HISTORY
- * November 12, 1997 - Andrew E. Mileski
- * Written, tested, and released.
- */
-static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length)
+static int udf_char2uni_utf8(const unsigned char *in,
+ int boundlen,
+ wchar_t *uni)
{
- unsigned c, i, max_val, utf_char;
+ unsigned int utf_char;
+ unsigned char c;
int utf_cnt, u_len;
- memset(ocu, 0, sizeof(dstring) * length);
- ocu[0] = 8;
- max_val = 0xffU;
-
-try_again:
- u_len = 0U;
- utf_char = 0U;
- utf_cnt = 0U;
- for (i = 0U; i < utf->u_len; i++) {
- c = (uint8_t)utf->u_name[i];
+ utf_char = 0;
+ utf_cnt = 0;
+ for (u_len = 0; u_len < boundlen;) {
+ c = in[u_len++];
/* Complete a multi-byte UTF-8 character */
if (utf_cnt) {
- utf_char = (utf_char << 6) | (c & 0x3fU);
+ utf_char = (utf_char << 6) | (c & 0x3f);
if (--utf_cnt)
continue;
} else {
/* Check for a multi-byte UTF-8 character */
- if (c & 0x80U) {
+ if (c & 0x80) {
/* Start a multi-byte UTF-8 character */
- if ((c & 0xe0U) == 0xc0U) {
- utf_char = c & 0x1fU;
+ if ((c & 0xe0) == 0xc0) {
+ utf_char = c & 0x1f;
utf_cnt = 1;
- } else if ((c & 0xf0U) == 0xe0U) {
- utf_char = c & 0x0fU;
+ } else if ((c & 0xf0) == 0xe0) {
+ utf_char = c & 0x0f;
utf_cnt = 2;
- } else if ((c & 0xf8U) == 0xf0U) {
- utf_char = c & 0x07U;
+ } else if ((c & 0xf8) == 0xf0) {
+ utf_char = c & 0x07;
utf_cnt = 3;
- } else if ((c & 0xfcU) == 0xf8U) {
- utf_char = c & 0x03U;
+ } else if ((c & 0xfc) == 0xf8) {
+ utf_char = c & 0x03;
utf_cnt = 4;
- } else if ((c & 0xfeU) == 0xfcU) {
- utf_char = c & 0x01U;
+ } else if ((c & 0xfe) == 0xfc) {
+ utf_char = c & 0x01;
utf_cnt = 5;
} else {
- goto error_out;
+ utf_cnt = -1;
+ break;
}
continue;
} else {
@@ -219,92 +101,216 @@ try_again:
utf_char = c;
}
}
-
- /* Choose no compression if necessary */
- if (utf_char > max_val) {
- if (max_val == 0xffU) {
- max_val = 0xffffU;
- ocu[0] = (uint8_t)0x10U;
- goto try_again;
- }
- goto error_out;
- }
-
- if (max_val == 0xffffU)
- ocu[++u_len] = (uint8_t)(utf_char >> 8);
- ocu[++u_len] = (uint8_t)(utf_char & 0xffU);
+ *uni = utf_char;
+ break;
}
-
if (utf_cnt) {
-error_out:
- ocu[++u_len] = '?';
- printk(KERN_DEBUG pr_fmt("bad UTF-8 character\n"));
+ *uni = '?';
+ return -EINVAL;
}
+ return u_len;
+}
+
+#define ILLEGAL_CHAR_MARK '_'
+#define EXT_MARK '.'
+#define CRC_MARK '#'
+#define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
- ocu[length - 1] = (uint8_t)u_len + 1;
+static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
+ int *str_o_idx,
+ const uint8_t *str_i, int str_i_max_len,
+ int *str_i_idx,
+ int u_ch, int *needsCRC,
+ int (*conv_f)(wchar_t, unsigned char *, int),
+ int translate)
+{
+ uint32_t c;
+ int illChar = 0;
+ int len, gotch = 0;
+
+ for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
+ if (*str_o_idx >= str_o_max_len) {
+ *needsCRC = 1;
+ return gotch;
+ }
- return u_len + 1;
+ /* Expand OSTA compressed Unicode to Unicode */
+ c = str_i[*str_i_idx];
+ if (u_ch > 1)
+ c = (c << 8) | str_i[*str_i_idx + 1];
+
+ if (translate && (c == '/' || c == 0))
+ illChar = 1;
+ else if (illChar)
+ break;
+ else
+ gotch = 1;
+ }
+ if (illChar) {
+ *needsCRC = 1;
+ c = ILLEGAL_CHAR_MARK;
+ gotch = 1;
+ }
+ if (gotch) {
+ len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
+ /* Valid character? */
+ if (len >= 0)
+ *str_o_idx += len;
+ else if (len == -ENAMETOOLONG) {
+ *needsCRC = 1;
+ gotch = 0;
+ } else {
+ str_o[(*str_o_idx)++] = '?';
+ *needsCRC = 1;
+ }
+ }
+ return gotch;
}
-static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
- const struct ustr *ocu_i)
+static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
+ const uint8_t *ocu, int ocu_len,
+ int (*conv_f)(wchar_t, unsigned char *, int),
+ int translate)
{
- const uint8_t *ocu;
- uint8_t cmp_id, ocu_len;
- int i, len;
+ uint32_t c;
+ uint8_t cmp_id;
+ int idx, len;
+ int u_ch;
+ int needsCRC = 0;
+ int ext_i_len, ext_max_len;
+ int str_o_len = 0; /* Length of resulting output */
+ int ext_o_len = 0; /* Extension output length */
+ int ext_crc_len = 0; /* Extension output length if used with CRC */
+ int i_ext = -1; /* Extension position in input buffer */
+ int o_crc = 0; /* Rightmost possible output pos for CRC+ext */
+ unsigned short valueCRC;
+ uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
+ uint8_t crc[CRC_LEN];
+ if (str_max_len <= 0)
+ return 0;
- ocu_len = ocu_i->u_len;
if (ocu_len == 0) {
- memset(utf_o, 0, sizeof(struct ustr));
+ memset(str_o, 0, str_max_len);
return 0;
}
- cmp_id = ocu_i->u_cmpID;
+ cmp_id = ocu[0];
if (cmp_id != 8 && cmp_id != 16) {
- memset(utf_o, 0, sizeof(struct ustr));
- pr_err("unknown compression code (%d) stri=%s\n",
- cmp_id, ocu_i->u_name);
+ memset(str_o, 0, str_max_len);
+ pr_err("unknown compression code (%d)\n", cmp_id);
return -EINVAL;
}
+ u_ch = cmp_id >> 3;
- ocu = ocu_i->u_name;
- utf_o->u_len = 0;
- for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
- /* Expand OSTA compressed Unicode to Unicode */
- uint32_t c = ocu[i++];
- if (cmp_id == 16)
- c = (c << 8) | ocu[i++];
+ ocu++;
+ ocu_len--;
- len = nls->uni2char(c, &utf_o->u_name[utf_o->u_len],
- UDF_NAME_LEN - utf_o->u_len);
- /* Valid character? */
- if (len >= 0)
- utf_o->u_len += len;
- else
- utf_o->u_name[utf_o->u_len++] = '?';
+ if (ocu_len % u_ch) {
+ pr_err("incorrect filename length (%d)\n", ocu_len + 1);
+ return -EINVAL;
+ }
+
+ if (translate) {
+ /* Look for extension */
+ for (idx = ocu_len - u_ch, ext_i_len = 0;
+ (idx >= 0) && (ext_i_len < EXT_SIZE);
+ idx -= u_ch, ext_i_len++) {
+ c = ocu[idx];
+ if (u_ch > 1)
+ c = (c << 8) | ocu[idx + 1];
+
+ if (c == EXT_MARK) {
+ if (ext_i_len)
+ i_ext = idx;
+ break;
+ }
+ }
+ if (i_ext >= 0) {
+ /* Convert extension */
+ ext_max_len = min_t(int, sizeof(ext), str_max_len);
+ ext[ext_o_len++] = EXT_MARK;
+ idx = i_ext + u_ch;
+ while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
+ ocu, ocu_len, &idx,
+ u_ch, &needsCRC,
+ conv_f, translate)) {
+ if ((ext_o_len + CRC_LEN) < str_max_len)
+ ext_crc_len = ext_o_len;
+ }
+ }
+ }
+
+ idx = 0;
+ while (1) {
+ if (translate && (idx == i_ext)) {
+ if (str_o_len > (str_max_len - ext_o_len))
+ needsCRC = 1;
+ break;
+ }
+
+ if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
+ ocu, ocu_len, &idx,
+ u_ch, &needsCRC, conv_f, translate))
+ break;
+
+ if (translate &&
+ (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
+ o_crc = str_o_len;
+ }
+
+ if (translate) {
+ if (str_o_len <= 2 && str_o[0] == '.' &&
+ (str_o_len == 1 || str_o[1] == '.'))
+ needsCRC = 1;
+ if (needsCRC) {
+ str_o_len = o_crc;
+ valueCRC = crc_itu_t(0, ocu, ocu_len);
+ crc[0] = CRC_MARK;
+ crc[1] = hex_asc_upper_hi(valueCRC >> 8);
+ crc[2] = hex_asc_upper_lo(valueCRC >> 8);
+ crc[3] = hex_asc_upper_hi(valueCRC);
+ crc[4] = hex_asc_upper_lo(valueCRC);
+ len = min_t(int, CRC_LEN, str_max_len - str_o_len);
+ memcpy(&str_o[str_o_len], crc, len);
+ str_o_len += len;
+ ext_o_len = ext_crc_len;
+ }
+ if (ext_o_len > 0) {
+ memcpy(&str_o[str_o_len], ext, ext_o_len);
+ str_o_len += ext_o_len;
+ }
}
- utf_o->u_cmpID = 8;
- return utf_o->u_len;
+ return str_o_len;
}
-static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni,
- int length)
+static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
+ const uint8_t *str_i, int str_len,
+ int (*conv_f)(const unsigned char *, int, wchar_t *))
{
- int len;
- unsigned i, max_val;
- uint16_t uni_char;
- int u_len;
+ int i, len;
+ unsigned int max_val;
+ wchar_t uni_char;
+ int u_len, u_ch;
- memset(ocu, 0, sizeof(dstring) * length);
+ if (ocu_max_len <= 0)
+ return 0;
+
+ memset(ocu, 0, ocu_max_len);
ocu[0] = 8;
- max_val = 0xffU;
+ max_val = 0xff;
+ u_ch = 1;
try_again:
- u_len = 0U;
- for (i = 0U; i < uni->u_len; i++) {
- len = nls->char2uni(&uni->u_name[i], uni->u_len - i, &uni_char);
+ u_len = 1;
+ for (i = 0; i < str_len; i++) {
+ /* Name didn't fit? */
+ if (u_len + u_ch > ocu_max_len)
+ return 0;
+ len = conv_f(&str_i[i], str_len - i, &uni_char);
if (!len)
continue;
/* Invalid character, deal with it */
@@ -314,186 +320,65 @@ try_again:
}
if (uni_char > max_val) {
- max_val = 0xffffU;
- ocu[0] = (uint8_t)0x10U;
+ max_val = 0xffff;
+ ocu[0] = 0x10;
+ u_ch = 2;
goto try_again;
}
- if (max_val == 0xffffU)
- ocu[++u_len] = (uint8_t)(uni_char >> 8);
- ocu[++u_len] = (uint8_t)(uni_char & 0xffU);
+ if (max_val == 0xffff)
+ ocu[u_len++] = (uint8_t)(uni_char >> 8);
+ ocu[u_len++] = (uint8_t)(uni_char & 0xff);
i += len - 1;
}
- ocu[length - 1] = (uint8_t)u_len + 1;
- return u_len + 1;
+ return u_len;
+}
+
+int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len)
+{
+ return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len,
+ udf_uni2char_utf8, 0);
}
-int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
+int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
uint8_t *dname, int dlen)
{
- struct ustr *filename, *unifilename;
+ int (*conv_f)(wchar_t, unsigned char *, int);
int ret;
if (!slen)
return -EIO;
- filename = kmalloc(sizeof(struct ustr), GFP_NOFS);
- if (!filename)
- return -ENOMEM;
-
- unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS);
- if (!unifilename) {
- ret = -ENOMEM;
- goto out1;
- }
+ if (dlen <= 0)
+ return 0;
- udf_build_ustr_exact(unifilename, sname, slen);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- ret = udf_CS0toUTF8(filename, unifilename);
- if (ret < 0) {
- udf_debug("Failed in udf_get_filename: sname = %s\n",
- sname);
- goto out2;
- }
+ conv_f = udf_uni2char_utf8;
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename,
- unifilename);
- if (ret < 0) {
- udf_debug("Failed in udf_get_filename: sname = %s\n",
- sname);
- goto out2;
- }
+ conv_f = UDF_SB(sb)->s_nls_map->uni2char;
} else
BUG();
- ret = udf_translate_to_linux(dname, dlen,
- filename->u_name, filename->u_len,
- unifilename->u_name, unifilename->u_len);
+ ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
/* Zero length filename isn't valid... */
if (ret == 0)
ret = -EINVAL;
-out2:
- kfree(unifilename);
-out1:
- kfree(filename);
return ret;
}
-int udf_put_filename(struct super_block *sb, const uint8_t *sname,
- uint8_t *dname, int flen)
+int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
+ uint8_t *dname, int dlen)
{
- struct ustr unifilename;
- int namelen;
-
- if (!udf_char_to_ustr(&unifilename, sname, flen))
- return 0;
+ int (*conv_f)(const unsigned char *, int, wchar_t *);
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN);
- if (!namelen)
- return 0;
+ conv_f = udf_char2uni_utf8;
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname,
- &unifilename, UDF_NAME_LEN);
- if (!namelen)
- return 0;
+ conv_f = UDF_SB(sb)->s_nls_map->char2uni;
} else
- return 0;
+ BUG();
- return namelen;
+ return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
}
-#define ILLEGAL_CHAR_MARK '_'
-#define EXT_MARK '.'
-#define CRC_MARK '#'
-#define EXT_SIZE 5
-/* Number of chars we need to store generated CRC to make filename unique */
-#define CRC_LEN 5
-
-static int udf_translate_to_linux(uint8_t *newName, int newLen,
- uint8_t *udfName, int udfLen,
- uint8_t *fidName, int fidNameLen)
-{
- int index, newIndex = 0, needsCRC = 0;
- int extIndex = 0, newExtIndex = 0, hasExt = 0;
- unsigned short valueCRC;
- uint8_t curr;
-
- if (udfName[0] == '.' &&
- (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
- needsCRC = 1;
- newIndex = udfLen;
- memcpy(newName, udfName, udfLen);
- } else {
- for (index = 0; index < udfLen; index++) {
- curr = udfName[index];
- if (curr == '/' || curr == 0) {
- needsCRC = 1;
- curr = ILLEGAL_CHAR_MARK;
- while (index + 1 < udfLen &&
- (udfName[index + 1] == '/' ||
- udfName[index + 1] == 0))
- index++;
- }
- if (curr == EXT_MARK &&
- (udfLen - index - 1) <= EXT_SIZE) {
- if (udfLen == index + 1)
- hasExt = 0;
- else {
- hasExt = 1;
- extIndex = index;
- newExtIndex = newIndex;
- }
- }
- if (newIndex < newLen)
- newName[newIndex++] = curr;
- else
- needsCRC = 1;
- }
- }
- if (needsCRC) {
- uint8_t ext[EXT_SIZE];
- int localExtIndex = 0;
-
- if (hasExt) {
- int maxFilenameLen;
- for (index = 0;
- index < EXT_SIZE && extIndex + index + 1 < udfLen;
- index++) {
- curr = udfName[extIndex + index + 1];
-
- if (curr == '/' || curr == 0) {
- needsCRC = 1;
- curr = ILLEGAL_CHAR_MARK;
- while (extIndex + index + 2 < udfLen &&
- (index + 1 < EXT_SIZE &&
- (udfName[extIndex + index + 2] == '/' ||
- udfName[extIndex + index + 2] == 0)))
- index++;
- }
- ext[localExtIndex++] = curr;
- }
- maxFilenameLen = newLen - CRC_LEN - localExtIndex;
- if (newIndex > maxFilenameLen)
- newIndex = maxFilenameLen;
- else
- newIndex = newExtIndex;
- } else if (newIndex > newLen - CRC_LEN)
- newIndex = newLen - CRC_LEN;
- newName[newIndex++] = CRC_MARK;
- valueCRC = crc_itu_t(0, fidName, fidNameLen);
- newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
- newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
- newName[newIndex++] = hex_asc_upper_hi(valueCRC);
- newName[newIndex++] = hex_asc_upper_lo(valueCRC);
-
- if (hasExt) {
- newName[newIndex++] = EXT_MARK;
- for (index = 0; index < localExtIndex; index++)
- newName[newIndex++] = ext[index];
- }
- }
-
- return newIndex;
-}
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index 392db25c0b56..ec4a6b49fa13 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -5,5 +5,5 @@
obj-$(CONFIG_UFS_FS) += ufs.o
ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
- namei.o super.o symlink.o util.o
+ namei.o super.o util.o
ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index dc5fae601c24..0447b949c7f5 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -237,7 +237,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
sector_t newb, struct page *locked_page)
{
const unsigned blks_per_page =
- 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ 1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
struct address_space * const mapping = inode->i_mapping;
pgoff_t index, cur_index, last_index;
@@ -255,9 +255,9 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
cur_index = locked_page->index;
end = count + beg;
- last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last_index = end >> (PAGE_SHIFT - inode->i_blkbits);
for (i = beg; i < end; i = (i | mask) + 1) {
- index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ index = i >> (PAGE_SHIFT - inode->i_blkbits);
if (likely(cur_index != index)) {
page = ufs_get_locked_page(mapping, index);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 74f2e80288bf..0b1457292734 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -62,7 +62,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
static inline void ufs_put_page(struct page *page)
{
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
@@ -111,13 +111,13 @@ static void ufs_check_page(struct page *page)
struct super_block *sb = dir->i_sb;
char *kaddr = page_address(page);
unsigned offs, rec_len;
- unsigned limit = PAGE_CACHE_SIZE;
+ unsigned limit = PAGE_SIZE;
const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1;
struct ufs_dir_entry *p;
char *error;
- if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_MASK;
if (limit & chunk_mask)
goto Ebadsize;
if (!limit)
@@ -170,7 +170,7 @@ Einumber:
bad_entry:
ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
"offset=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
rec_len, ufs_get_de_namlen(sb, p));
goto fail;
Eend:
@@ -178,7 +178,7 @@ Eend:
ufs_error(sb, __func__,
"entry in directory #%lu spans the page boundary"
"offset=%lu",
- dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+ dir->i_ino, (page->index<<PAGE_SHIFT)+offs);
fail:
SetPageChecked(page);
SetPageError(page);
@@ -211,9 +211,9 @@ ufs_last_byte(struct inode *inode, unsigned long page_nr)
{
unsigned last_byte = inode->i_size;
- last_byte -= page_nr << PAGE_CACHE_SHIFT;
- if (last_byte > PAGE_CACHE_SIZE)
- last_byte = PAGE_CACHE_SIZE;
+ last_byte -= page_nr << PAGE_SHIFT;
+ if (last_byte > PAGE_SIZE)
+ last_byte = PAGE_SIZE;
return last_byte;
}
@@ -341,7 +341,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
kaddr = page_address(page);
dir_end = kaddr + ufs_last_byte(dir, n);
de = (struct ufs_dir_entry *)kaddr;
- kaddr += PAGE_CACHE_SIZE - reclen;
+ kaddr += PAGE_SIZE - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -432,8 +432,8 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
loff_t pos = ctx->pos;
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- unsigned int offset = pos & ~PAGE_CACHE_MASK;
- unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned int offset = pos & ~PAGE_MASK;
+ unsigned long n = pos >> PAGE_SHIFT;
unsigned long npages = dir_pages(inode);
unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
int need_revalidate = file->f_version != inode->i_version;
@@ -454,14 +454,14 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
ufs_error(sb, __func__,
"bad page in #%lu",
inode->i_ino);
- ctx->pos += PAGE_CACHE_SIZE - offset;
+ ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
kaddr = page_address(page);
if (unlikely(need_revalidate)) {
if (offset) {
offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
- ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ ctx->pos = (n<<PAGE_SHIFT) + offset;
}
file->f_version = inode->i_version;
need_revalidate = 0;
@@ -574,7 +574,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
kmap(page);
base = (char*)page_address(page);
- memset(base, 0, PAGE_CACHE_SIZE);
+ memset(base, 0, PAGE_SIZE);
de = (struct ufs_dir_entry *) base;
@@ -594,7 +594,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
err = ufs_commit_chunk(page, 0, chunk_size);
fail:
- page_cache_release(page);
+ put_page(page);
return err;
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index a064cf44b143..9f49431e798d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -528,11 +528,12 @@ static void ufs_set_inode_ops(struct inode *inode)
inode->i_mapping->a_ops = &ufs_aops;
} else if (S_ISLNK(inode->i_mode)) {
if (!inode->i_blocks) {
- inode->i_op = &ufs_fast_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+ inode->i_op = &simple_symlink_inode_operations;
} else {
- inode->i_op = &ufs_symlink_inode_operations;
inode->i_mapping->a_ops = &ufs_aops;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
}
} else
init_special_inode(inode, inode->i_mode,
@@ -1050,13 +1051,13 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
lastfrag--;
lastpage = ufs_get_locked_page(mapping, lastfrag >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits));
+ (PAGE_SHIFT - inode->i_blkbits));
if (IS_ERR(lastpage)) {
err = -EIO;
goto out;
}
- end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+ end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
bh = page_buffers(lastpage);
for (i = 0; i < end; ++i)
bh = bh->b_this_page;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 47966554317c..a1559f762805 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -123,14 +123,15 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
/* slow symlink */
- inode->i_op = &ufs_symlink_inode_operations;
+ inode->i_op = &page_symlink_inode_operations;
+ inode_nohighmem(inode);
inode->i_mapping->a_ops = &ufs_aops;
err = page_symlink(inode, symname, l);
if (err)
goto out_fail;
} else {
/* fast symlink */
- inode->i_op = &ufs_fast_symlink_inode_operations;
+ inode->i_op = &simple_symlink_inode_operations;
inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
memcpy(inode->i_link, symname, l);
inode->i_size = l-1;
@@ -304,7 +305,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0);
else {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
inode_dec_link_count(old_dir);
}
@@ -314,11 +315,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
out_dir:
if (dir_de) {
kunmap(dir_page);
- page_cache_release(dir_page);
+ put_page(dir_page);
}
out_old:
kunmap(old_page);
- page_cache_release(old_page);
+ put_page(old_page);
out:
return err;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f6390eec02ca..442fd52ebffe 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1427,7 +1427,7 @@ static int __init init_inodecache(void)
ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
sizeof(struct ufs_inode_info),
0, (SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (ufs_inode_cachep == NULL)
return -ENOMEM;
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
deleted file mode 100644
index 874480bb43e9..000000000000
--- a/fs/ufs/symlink.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * linux/fs/ufs/symlink.c
- *
- * Only fast symlinks left here - the rest is done by generic code. AV, 1999
- *
- * Copyright (C) 1998
- * Daniel Pirkl <daniel.pirkl@emai.cz>
- * Charles University, Faculty of Mathematics and Physics
- *
- * from
- *
- * linux/fs/ext2/symlink.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/symlink.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext2 symlink handling code
- */
-
-#include "ufs_fs.h"
-#include "ufs.h"
-
-const struct inode_operations ufs_fast_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = simple_follow_link,
- .setattr = ufs_setattr,
-};
-
-const struct inode_operations ufs_symlink_inode_operations = {
- .readlink = generic_readlink,
- .follow_link = page_follow_link_light,
- .put_link = page_put_link,
- .setattr = ufs_setattr,
-};
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 7da4aca868c0..c87f4c3fa9dd 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -136,10 +136,6 @@ extern __printf(3, 4)
void ufs_panic(struct super_block *, const char *, const char *, ...);
void ufs_mark_sb_dirty(struct super_block *sb);
-/* symlink.c */
-extern const struct inode_operations ufs_fast_symlink_inode_operations;
-extern const struct inode_operations ufs_symlink_inode_operations;
-
static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index b6c2f94e041e..a409e3e7827a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -261,14 +261,14 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
if (unlikely(page->mapping == NULL)) {
/* Truncate got there first */
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
goto out;
}
if (!PageUptodate(page) || PageError(page)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
printk(KERN_ERR "ufs_change_blocknr: "
"can not read page: ino %lu, index: %lu\n",
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 954175928240..b7fbf53dbc81 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -283,7 +283,7 @@ extern struct page *ufs_get_locked_page(struct address_space *mapping,
static inline void ufs_put_locked_page(struct page *page)
{
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 50311703135b..66cdb44616d5 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
goto out;
/*
+ * We don't do userfault handling for the final child pid update.
+ */
+ if (current->flags & PF_EXITING)
+ goto out;
+
+ /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
diff --git a/fs/utimes.c b/fs/utimes.c
index aa138d64560a..85c40f4f373d 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times)
}
}
retry_deleg:
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = notify_change(path->dentry, &newattrs, &delegated_inode);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
diff --git a/fs/xattr.c b/fs/xattr.c
index 072fee1258dd..4861322e28e8 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_inode_setxattr(dentry, name, value, size, flags);
if (error)
goto out;
@@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -208,25 +208,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
return error;
}
-/* Compare an extended attribute value with the given value */
-int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
- const char *value, size_t size, gfp_t flags)
-{
- char *xattr_value = NULL;
- int rc;
-
- rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
- if (rc < 0)
- return rc;
-
- if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
- rc = -EINVAL;
- else
- rc = 0;
- kfree(xattr_value);
- return rc;
-}
-
ssize_t
vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
{
@@ -296,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
if (error)
return error;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
error = security_inode_removexattr(dentry, name);
if (error)
goto out;
@@ -309,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
}
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
@@ -324,7 +305,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
{
int error;
void *kvalue = NULL;
- void *vvalue = NULL; /* If non-NULL, we used vmalloc() */
char kname[XATTR_NAME_MAX + 1];
if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
@@ -341,10 +321,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
return -E2BIG;
kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
if (copy_from_user(kvalue, value, size)) {
error = -EFAULT;
@@ -357,10 +336,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
error = vfs_setxattr(d, kname, kvalue, size, flags);
out:
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+ kvfree(kvalue);
+
return error;
}
@@ -428,7 +405,6 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
{
ssize_t error;
void *kvalue = NULL;
- void *vvalue = NULL;
char kname[XATTR_NAME_MAX + 1];
error = strncpy_from_user(kname, name, sizeof(kname));
@@ -442,10 +418,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
size = XATTR_SIZE_MAX;
kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!kvalue) {
- vvalue = vmalloc(size);
- if (!vvalue)
+ kvalue = vmalloc(size);
+ if (!kvalue)
return -ENOMEM;
- kvalue = vvalue;
}
}
@@ -461,10 +436,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
than XATTR_SIZE_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vvalue)
- vfree(vvalue);
- else
- kfree(kvalue);
+
+ kvfree(kvalue);
+
return error;
}
@@ -521,17 +495,15 @@ listxattr(struct dentry *d, char __user *list, size_t size)
{
ssize_t error;
char *klist = NULL;
- char *vlist = NULL; /* If non-NULL, we used vmalloc() */
if (size) {
if (size > XATTR_LIST_MAX)
size = XATTR_LIST_MAX;
klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
if (!klist) {
- vlist = vmalloc(size);
- if (!vlist)
+ klist = vmalloc(size);
+ if (!klist)
return -ENOMEM;
- klist = vlist;
}
}
@@ -544,10 +516,9 @@ listxattr(struct dentry *d, char __user *list, size_t size)
than XATTR_LIST_MAX bytes. Not possible. */
error = -E2BIG;
}
- if (vlist)
- vfree(vlist);
- else
- kfree(klist);
+
+ kvfree(klist);
+
return error;
}
@@ -700,13 +671,20 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
return NULL;
for_each_xattr_handler(handlers, handler) {
- const char *n = strcmp_prefix(*name, handler->prefix);
+ const char *n;
+
+ n = strcmp_prefix(*name, xattr_prefix(handler));
if (n) {
+ if (!handler->prefix ^ !*n) {
+ if (*n)
+ continue;
+ return ERR_PTR(-EINVAL);
+ }
*name = n;
- break;
+ return handler;
}
}
- return handler;
+ return ERR_PTR(-EOPNOTSUPP);
}
/*
@@ -718,9 +696,9 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -735,19 +713,25 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!buffer) {
for_each_xattr_handler(handlers, handler) {
- size += handler->list(dentry, NULL, 0, NULL, 0,
- handler->flags);
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ size += strlen(handler->name) + 1;
}
} else {
char *buf = buffer;
+ size_t len;
for_each_xattr_handler(handlers, handler) {
- size = handler->list(dentry, buf, buffer_size,
- NULL, 0, handler->flags);
- if (size > buffer_size)
+ if (!handler->name ||
+ (handler->list && !handler->list(dentry)))
+ continue;
+ len = strlen(handler->name);
+ if (len + 1 > buffer_size)
return -ERANGE;
- buf += size;
- buffer_size -= size;
+ memcpy(buf, handler->name, len + 1);
+ buf += len + 1;
+ buffer_size -= len + 1;
}
size = buf - buffer;
}
@@ -765,9 +749,9 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
if (size == 0)
value = ""; /* empty EA, do not remove */
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -780,10 +764,9 @@ generic_removexattr(struct dentry *dentry, const char *name)
const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0,
- XATTR_REPLACE, handler->flags);
+ if (IS_ERR(handler))
+ return PTR_ERR(handler);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(generic_getxattr);
@@ -791,6 +774,30 @@ EXPORT_SYMBOL(generic_listxattr);
EXPORT_SYMBOL(generic_setxattr);
EXPORT_SYMBOL(generic_removexattr);
+/**
+ * xattr_full_name - Compute full attribute name from suffix
+ *
+ * @handler: handler of the xattr_handler operation
+ * @name: name passed to the xattr_handler operation
+ *
+ * The get and set xattr handler operations are called with the remainder of
+ * the attribute name after skipping the handler's prefix: for example, "foo"
+ * is passed to the get operation of a handler with prefix "user." to get
+ * attribute "user.foo". The full name is still "there" in the name though.
+ *
+ * Note: the list xattr handler operation when called from the vfs is passed a
+ * NULL name; some file systems use this operation internally, with varying
+ * semantics.
+ */
+const char *xattr_full_name(const struct xattr_handler *handler,
+ const char *name)
+{
+ size_t prefix_len = strlen(xattr_prefix(handler));
+
+ return name - prefix_len;
+}
+EXPORT_SYMBOL(xattr_full_name);
+
/*
* Allocate new xattr and copy in the value; but leave the name to callers.
*/
@@ -840,8 +847,22 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
return ret;
}
-static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
+/**
+ * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
+ * @xattrs: target simple_xattr list
+ * @name: name of the extended attribute
+ * @value: value of the xattr. If %NULL, will remove the attribute.
+ * @size: size of the new xattr
+ * @flags: %XATTR_{CREATE|REPLACE}
+ *
+ * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
+ * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
+ * otherwise, fails with -ENODATA.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
+ const void *value, size_t size, int flags)
{
struct simple_xattr *xattr;
struct simple_xattr *new_xattr = NULL;
@@ -891,73 +912,64 @@ out:
}
-/**
- * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
- * @xattrs: target simple_xattr list
- * @name: name of the new extended attribute
- * @value: value of the new xattr. If %NULL, will remove the attribute
- * @size: size of the new xattr
- * @flags: %XATTR_{CREATE|REPLACE}
- *
- * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
- * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
- * otherwise, fails with -ENODATA.
- *
- * Returns 0 on success, -errno on failure.
- */
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags)
-{
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- return __simple_xattr_set(xattrs, name, value, size, flags);
-}
-
-/*
- * xattr REMOVE operation for in-memory/pseudo filesystems
- */
-int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
+static bool xattr_is_trusted(const char *name)
{
- return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
+ return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static bool xattr_is_trusted(const char *name)
+static int xattr_list_one(char **buffer, ssize_t *remaining_size,
+ const char *name)
{
- return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ size_t len = strlen(name) + 1;
+ if (*buffer) {
+ if (*remaining_size < len)
+ return -ERANGE;
+ memcpy(*buffer, name, len);
+ *buffer += len;
+ }
+ *remaining_size -= len;
+ return 0;
}
/*
* xattr LIST operation for in-memory/pseudo filesystems
*/
-ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
- size_t size)
+ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
+ char *buffer, size_t size)
{
bool trusted = capable(CAP_SYS_ADMIN);
struct simple_xattr *xattr;
- size_t used = 0;
+ ssize_t remaining_size = size;
+ int err = 0;
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (inode->i_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_ACCESS);
+ if (err)
+ return err;
+ }
+ if (inode->i_default_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
+ if (err)
+ return err;
+ }
+#endif
spin_lock(&xattrs->lock);
list_for_each_entry(xattr, &xattrs->head, list) {
- size_t len;
-
/* skip "trusted." attributes for unprivileged callers */
if (!trusted && xattr_is_trusted(xattr->name))
continue;
- len = strlen(xattr->name) + 1;
- used += len;
- if (buffer) {
- if (size < used) {
- used = -ERANGE;
- break;
- }
- memcpy(buffer, xattr->name, len);
- buffer += len;
- }
+ err = xattr_list_one(&buffer, &remaining_size, xattr->name);
+ if (err)
+ break;
}
spin_unlock(&xattrs->lock);
- return used;
+ return err ? err : size - remaining_size;
}
/*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a096841bd06c..3542d94fddce 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_stats.o \
xfs_super.o \
xfs_symlink.o \
xfs_sysfs.o \
@@ -118,7 +119,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
-xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
-xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
+xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
+xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63bb360..686ba6fb20dd 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
+ current->comm, current->pid,
+ (unsigned int)size, __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
+ __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index cc6b768fc068..d1c66e465ca5 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN
#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT
#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT SLAB_ACCOUNT
#define kmem_zone kmem_cache
#define kmem_zone_t struct kmem_cache
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ffad7f20342f..a708e38b494c 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -482,7 +482,9 @@ xfs_agfl_verify(
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return false;
}
- return true;
+
+ return xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
}
static void
@@ -533,6 +535,7 @@ xfs_agfl_write_verify(
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .name = "xfs_agfl",
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
};
@@ -651,8 +654,8 @@ xfs_alloc_ag_vextent(
-((long)(args->len)));
}
- XFS_STATS_INC(xs_allocx);
- XFS_STATS_ADD(xs_allocb, args->len);
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
return error;
}
@@ -1808,8 +1811,8 @@ xfs_free_ag_extent(
if (!isfl)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
- XFS_STATS_INC(xs_freex);
- XFS_STATS_ADD(xs_freeb, len);
+ XFS_STATS_INC(mp, xs_freex);
+ XFS_STATS_ADD(mp, xs_freeb, len);
trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
@@ -1924,7 +1927,7 @@ xfs_alloc_space_available(
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_fix_freelist(
struct xfs_alloc_arg *args, /* allocation argument structure */
int flags) /* XFS_ALLOC_FLAG_... */
@@ -2259,9 +2262,13 @@ xfs_agf_verify(
{
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
return false;
+ }
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
@@ -2333,6 +2340,7 @@ xfs_agf_write_verify(
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .name = "xfs_agf",
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
};
@@ -2503,7 +2511,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
- if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+ if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
@@ -2634,6 +2642,14 @@ xfs_alloc_vextent(
XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
args->len);
#endif
+
+ /* Zero the extent if we were asked to do so */
+ if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+ if (error)
+ goto error0;
+ }
+
}
xfs_perag_put(args->pag);
return 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index ca1c8168373a..135eb3d24db7 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
struct xfs_perag *pag; /* per-ag struct for this agno */
+ struct xfs_inode *ip; /* for userdata zeroing method */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
xfs_agblock_t agbno; /* allocation group-relative block # */
@@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg {
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
char isfl; /* set if is freelist blocks - !acctg */
- char userdata; /* set if this is user data */
+ char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
} xfs_alloc_arg_t;
/*
* Defines for userdata
*/
-#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
+#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag, xfs_extlen_t need);
@@ -233,5 +235,6 @@ xfs_alloc_get_rec(
int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 90de071dd4c2..d9b42425291e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -118,8 +118,6 @@ xfs_allocbt_free_block(
xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
- xfs_trans_binval(cur->bc_tp, bp);
return 0;
}
@@ -293,14 +291,7 @@ xfs_allocbt_verify(
level = be16_to_cpu(block->bb_level);
switch (block->bb_magic) {
case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_ABTB_MAGIC):
@@ -311,14 +302,7 @@ xfs_allocbt_verify(
return false;
break;
case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_ABTC_MAGIC):
@@ -332,21 +316,7 @@ xfs_allocbt_verify(
return false;
}
- /* numrecs verification */
- if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
- return false;
-
- /* sibling pointer verification */
- if (!block->bb_u.s.bb_leftsib ||
- (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- if (!block->bb_u.s.bb_rightsib ||
- (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
-
- return true;
+ return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
static void
@@ -379,6 +349,7 @@ xfs_allocbt_write_verify(
}
const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .name = "xfs_allocbt",
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index ff065578969f..fa3b948ef9c2 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -125,7 +125,7 @@ xfs_attr_get(
uint lock_mode;
int error;
- XFS_STATS_INC(xs_attr_get);
+ XFS_STATS_INC(ip->i_mount, xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
@@ -207,9 +207,9 @@ xfs_attr_set(
struct xfs_trans_res tres;
xfs_fsblock_t firstblock;
int rsvd = (flags & ATTR_ROOT) != 0;
- int error, err2, committed, local;
+ int error, err2, local;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(mp, xs_attr_set);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -334,25 +334,15 @@ xfs_attr_set(
*/
xfs_bmap_init(args.flist, args.firstblock);
error = xfs_attr_shortform_to_leaf(&args);
- if (!error) {
- error = xfs_bmap_finish(&args.trans, args.flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args.trans, args.flist, dp);
if (error) {
- ASSERT(committed);
args.trans = NULL;
xfs_bmap_cancel(&flist);
goto out;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args.trans, dp, 0);
-
- /*
* Commit the leaf transformation. We'll need another (linked)
* transaction to add the new attribute to the leaf.
*/
@@ -412,7 +402,7 @@ xfs_attr_remove(
xfs_fsblock_t firstblock;
int error;
- XFS_STATS_INC(xs_attr_remove);
+ XFS_STATS_INC(mp, xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -568,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
{
xfs_inode_t *dp;
struct xfs_buf *bp;
- int retval, error, committed, forkoff;
+ int retval, error, forkoff;
trace_xfs_attr_leaf_addname(args);
@@ -628,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- /*
* Commit the current trans (including the inode) and start
* a new one.
*/
@@ -729,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
/*
@@ -775,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
{
xfs_inode_t *dp;
struct xfs_buf *bp;
- int error, committed, forkoff;
+ int error, forkoff;
trace_xfs_attr_leaf_removename(args);
@@ -803,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
return 0;
}
@@ -877,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args)
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
xfs_mount_t *mp;
- int committed, retval, error;
+ int retval, error;
trace_xfs_attr_node_addname(args);
@@ -938,27 +897,16 @@ restart:
state = NULL;
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_node(args);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
/*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
- /*
* Commit the node conversion and start the next
* trans in the chain.
*/
@@ -977,23 +925,13 @@ restart:
*/
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_split(state);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1086,25 +1024,14 @@ restart:
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_join(state);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
}
/*
@@ -1146,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_da_state_blk_t *blk;
xfs_inode_t *dp;
struct xfs_buf *bp;
- int retval, error, committed, forkoff;
+ int retval, error, forkoff;
trace_xfs_attr_node_removename(args);
@@ -1220,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
if (retval && (state->path.active > 1)) {
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_da3_join(state);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
/*
* Commit the Btree join operation and start a new trans.
*/
@@ -1265,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args)
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans,
- args->flist,
- &committed);
- }
+ args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
goto out;
}
-
- /*
- * bmap_finish() may have committed the last trans
- * and started a new one. We need the inode to be
- * in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
} else
xfs_trans_brelse(args->trans, bp);
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 33df52d97ec7..01a5ecfedfcf 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -41,6 +41,7 @@
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
#include "xfs_dir2.h"
+#include "xfs_log.h"
/*
@@ -266,6 +267,8 @@ xfs_attr3_leaf_verify(
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
return false;
@@ -325,6 +328,7 @@ xfs_attr3_leaf_read_verify(
}
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+ .name = "xfs_attr3_leaf",
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index f38f9bd81557..a572532a55cd 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -107,7 +107,7 @@ xfs_attr3_rmt_verify(
if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
return false;
if (be32_to_cpu(rmt->rm_offset) +
- be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+ be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
return false;
if (rmt->rm_owner == 0)
return false;
@@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify(
}
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+ .name = "xfs_attr3_rmt",
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
};
@@ -447,8 +448,6 @@ xfs_attr_rmtval_set(
* Roll through the "value", allocating blocks on disk as required.
*/
while (blkcnt > 0) {
- int committed;
-
/*
* Allocate a single extent, up to the size of the value.
*
@@ -466,24 +465,14 @@ xfs_attr_rmtval_set(
error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
args->total, &map, &nmap, args->flist);
- if (!error) {
- error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ if (!error)
+ error = xfs_bmap_finish(&args->trans, args->flist, dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
- /*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, dp, 0);
-
ASSERT(nmap == 1);
ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
(map.br_startblock != HOLESTARTBLOCK));
@@ -614,31 +603,20 @@ xfs_attr_rmtval_remove(
blkcnt = args->rmtblkcnt;
done = 0;
while (!done) {
- int committed;
-
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK, 1, args->firstblock,
args->flist, &done);
- if (!error) {
+ if (!error)
error = xfs_bmap_finish(&args->trans, args->flist,
- &committed);
- }
+ args->dp);
if (error) {
- ASSERT(committed);
args->trans = NULL;
xfs_bmap_cancel(args->flist);
return error;
}
/*
- * bmap_finish() may have committed the last trans and started
- * a new one. We need the inode to be in all transactions.
- */
- if (committed)
- xfs_trans_ijoin(args->trans, args->dp, 0);
-
- /*
* Close out trans and start the next one in the chain.
*/
error = xfs_trans_roll(&args->trans, args->dp);
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..90928bbe693c 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -24,22 +24,6 @@
* Small attribute lists are packed as tightly as possible so as
* to fit into the literal area of the inode.
*/
-
-/*
- * Entries are packed toward the top as tight as possible.
- */
-typedef struct xfs_attr_shortform {
- struct xfs_attr_sf_hdr { /* constant-structure header block */
- __be16 totsize; /* total bytes in shortform list */
- __u8 count; /* count of active entries */
- } hdr;
- struct xfs_attr_sf_entry {
- __uint8_t namelen; /* actual length of name (no NULL) */
- __uint8_t valuelen; /* actual length of value (no NULL) */
- __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- __uint8_t nameval[1]; /* name & value bytes concatenated */
- } list[1]; /* variable sized array */
-} xfs_attr_shortform_t;
typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 0e8885a59646..0a94cce5ea35 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -32,13 +32,13 @@ int
xfs_bitmap_empty(uint *map, uint size)
{
uint i;
- uint ret = 0;
for (i = 0; i < size; i++) {
- ret |= map[i];
+ if (map[i] != 0)
+ return 0;
}
- return (ret == 0);
+ return 1;
}
/*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 8e2010d53b07..ce41d7fe753c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -325,9 +325,11 @@ xfs_check_block(
/*
* Check that the extents for the inode ip are in the right order in all
- * btree leaves.
+ * btree leaves. This becomes prohibitively expensive for large extent count
+ * files, so don't bother with inodes that have more than 10,000 extents in
+ * them. The btree record ordering checks will still be done, so for such large
+ * bmapbt constructs that is going to catch most corruptions.
*/
-
STATIC void
xfs_bmap_check_leaf_extents(
xfs_btree_cur_t *cur, /* btree cursor or null */
@@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents(
return;
}
+ /* skip large extent count inodes */
+ if (ip->i_d.di_nextents > 10000)
+ return;
+
bno = NULLFSBLOCK;
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -471,10 +477,7 @@ xfs_bmap_check_leaf_extents(
}
block = XFS_BUF_TO_BLOCK(bp);
}
- if (bp_release) {
- bp_release = 0;
- xfs_trans_brelse(NULL, bp);
- }
+
return;
error0:
@@ -906,7 +909,7 @@ xfs_bmap_local_to_extents(
* We don't want to deal with the case of keeping inode data inline yet.
* So sending the data fork of a regular inode is invalid.
*/
- ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK));
+ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK));
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
@@ -948,14 +951,16 @@ xfs_bmap_local_to_extents(
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
/*
- * Initialise the block and copy the data
+ * Initialize the block, copy the data and log the remote buffer.
*
- * Note: init_fn must set the buffer log item type correctly!
+ * The callout is responsible for logging because the remote format
+ * might differ from the local format and thus we don't know how much to
+ * log here. Note that init_fn must also set the buffer log item type
+ * correctly.
*/
init_fn(tp, bp, ip, ifp);
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ /* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
@@ -1071,7 +1076,7 @@ xfs_bmap_add_attrfork_local(
if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
return 0;
- if (S_ISDIR(ip->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
memset(&dargs, 0, sizeof(dargs));
dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
@@ -1083,7 +1088,7 @@ xfs_bmap_add_attrfork_local(
return xfs_dir2_sf_to_block(&dargs);
}
- if (S_ISLNK(ip->i_d.di_mode))
+ if (S_ISLNK(VFS_I(ip)->i_mode))
return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
flags, XFS_DATA_FORK,
xfs_symlink_local_to_remote);
@@ -1109,7 +1114,6 @@ xfs_bmap_add_attrfork(
xfs_trans_t *tp; /* transaction pointer */
int blks; /* space reservation */
int version = 1; /* superblock attr version */
- int committed; /* xaction was committed */
int logflags; /* logging flags */
int error; /* error return value */
@@ -1212,7 +1216,7 @@ xfs_bmap_add_attrfork(
xfs_log_sb(tp);
}
- error = xfs_bmap_finish(&tp, &flist, &committed);
+ error = xfs_bmap_finish(&tp, &flist, NULL);
if (error)
goto bmap_cancel;
error = xfs_trans_commit(tp);
@@ -1435,7 +1439,7 @@ xfs_bmap_search_extents(
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- XFS_STATS_INC(xs_look_exlist);
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
ifp = XFS_IFORK_PTR(ip, fork);
ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
@@ -1721,10 +1725,11 @@ xfs_bmap_add_extent_delay_real(
xfs_filblks_t temp=0; /* value for da_new calculations */
xfs_filblks_t temp2=0;/* value for da_new calculations */
int tmp_rval; /* partial logging flags */
+ int whichfork = XFS_DATA_FORK;
struct xfs_mount *mp;
- mp = bma->tp ? bma->tp->t_mountp : NULL;
- ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+ mp = bma->ip->i_mount;
+ ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1732,7 +1737,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -1783,7 +1788,7 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
@@ -2014,10 +2019,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist,
- &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+ &bma->cur, 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2098,10 +2103,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur, 1,
- &tmp_rval, XFS_DATA_FORK);
+ &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2167,10 +2172,10 @@ xfs_bmap_add_extent_delay_real(
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
}
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
- 1, &tmp_rval, XFS_DATA_FORK);
+ 1, &tmp_rval, whichfork);
rval |= tmp_rval;
if (error)
goto done;
@@ -2213,13 +2218,13 @@ xfs_bmap_add_extent_delay_real(
}
/* convert to a btree if necessary */
- if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
- da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+ da_old > 0, &tmp_logflags, whichfork);
bma->logflags |= tmp_logflags;
if (error)
goto done;
@@ -2240,7 +2245,7 @@ xfs_bmap_add_extent_delay_real(
if (bma->cur)
bma->cur->bc_private.b.allocated = 0;
- xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+ xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
done:
bma->logflags |= rval;
return error;
@@ -2286,7 +2291,7 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
ASSERT(!isnullstartblock(new->br_startblock));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -2937,7 +2942,7 @@ xfs_bmap_add_extent_hole_real(
int state; /* state bits, accessed thru macros */
struct xfs_mount *mp;
- mp = bma->tp ? bma->tp->t_mountp : NULL;
+ mp = bma->ip->i_mount;
ifp = XFS_IFORK_PTR(bma->ip, whichfork);
ASSERT(bma->idx >= 0);
@@ -2946,7 +2951,7 @@ xfs_bmap_add_extent_hole_real(
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
state = 0;
if (whichfork == XFS_ATTR_FORK)
@@ -3737,11 +3742,11 @@ xfs_bmap_btalloc(
args.prod = align;
if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
- } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+ } else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
args.prod = 1;
args.mod = 0;
} else {
- args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+ args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
}
@@ -3800,8 +3805,13 @@ xfs_bmap_btalloc(
args.wasdel = ap->wasdel;
args.isfl = 0;
args.userdata = ap->userdata;
- if ((error = xfs_alloc_vextent(&args)))
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+ args.ip = ap->ip;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
return error;
+
if (tryagain && args.fsbno == NULLFSBLOCK) {
/*
* Exact allocation failed. Now try with alignment
@@ -4036,7 +4046,7 @@ xfs_bmapi_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapr);
+ XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -4221,7 +4231,7 @@ xfs_bmapi_delay(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
@@ -4300,11 +4310,14 @@ xfs_bmapi_allocate(
/*
* Indicate if this is the first user data in the file, or just any
- * user data.
+ * user data. And if it is userdata, indicate whether it needs to
+ * be initialised to zero during allocation.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ if (bma->flags & XFS_BMAPI_ZERO)
+ bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4419,6 +4432,17 @@ xfs_bmapi_convert_unwritten(
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+ /*
+ * Before insertion into the bmbt, zero the range being converted
+ * if required.
+ */
+ if (flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, mval->br_startblock,
+ mval->br_blockcount);
+ if (error)
+ return error;
+ }
+
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
&bma->cur, mval, bma->firstblock, bma->flist,
&tmp_logflags);
@@ -4512,6 +4536,18 @@ xfs_bmapi_write(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /* zeroing is currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+ /*
+ * we can allocate unwritten extents or pre-zero allocated blocks,
+ * but it makes no sense to do both at once. This would result in
+ * zeroing the unwritten extent twice, but it still being an
+ * unwritten extent....
+ */
+ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
@@ -4525,7 +4561,7 @@ xfs_bmapi_write(
ifp = XFS_IFORK_PTR(ip, whichfork);
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
@@ -4682,6 +4718,66 @@ error0:
}
/*
+ * When a delalloc extent is split (e.g., due to a hole punch), the original
+ * indlen reservation must be shared across the two new extents that are left
+ * behind.
+ *
+ * Given the original reservation and the worst case indlen for the two new
+ * extents (as calculated by xfs_bmap_worst_indlen()), split the original
+ * reservation fairly across the two new extents. If necessary, steal available
+ * blocks from a deleted extent to make up a reservation deficiency (e.g., if
+ * ores == 1). The number of stolen blocks is returned. The availability and
+ * subsequent accounting of stolen blocks is the responsibility of the caller.
+ */
+static xfs_filblks_t
+xfs_bmap_split_indlen(
+ xfs_filblks_t ores, /* original res. */
+ xfs_filblks_t *indlen1, /* ext1 worst indlen */
+ xfs_filblks_t *indlen2, /* ext2 worst indlen */
+ xfs_filblks_t avail) /* stealable blocks */
+{
+ xfs_filblks_t len1 = *indlen1;
+ xfs_filblks_t len2 = *indlen2;
+ xfs_filblks_t nres = len1 + len2; /* new total res. */
+ xfs_filblks_t stolen = 0;
+
+ /*
+ * Steal as many blocks as we can to try and satisfy the worst case
+ * indlen for both new extents.
+ */
+ while (nres > ores && avail) {
+ nres--;
+ avail--;
+ stolen++;
+ }
+
+ /*
+ * The only blocks available are those reserved for the original
+ * extent and what we can steal from the extent being removed.
+ * If this still isn't enough to satisfy the combined
+ * requirements for the two new extents, skim blocks off of each
+ * of the new reservations until they match what is available.
+ */
+ while (nres > ores) {
+ if (len1) {
+ len1--;
+ nres--;
+ }
+ if (nres == ores)
+ break;
+ if (len2) {
+ len2--;
+ nres--;
+ }
+ }
+
+ *indlen1 = len1;
+ *indlen2 = len2;
+
+ return stolen;
+}
+
+/*
* Called by xfs_bmapi to update file extent records and the btree
* after removing space (or undoing a delayed allocation).
*/
@@ -4718,12 +4814,12 @@ xfs_bmap_del_extent(
xfs_filblks_t temp2; /* for indirect length calculations */
int state = 0;
- XFS_STATS_INC(xs_del_exlist);
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
- mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)));
@@ -4945,28 +5041,29 @@ xfs_bmap_del_extent(
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
} else {
+ xfs_filblks_t stolen;
ASSERT(whichfork == XFS_DATA_FORK);
- temp = xfs_bmap_worst_indlen(ip, temp);
+
+ /*
+ * Distribute the original indlen reservation across the
+ * two new extents. Steal blocks from the deleted extent
+ * if necessary. Stealing blocks simply fudges the
+ * fdblocks accounting in xfs_bunmapi().
+ */
+ temp = xfs_bmap_worst_indlen(ip, got.br_blockcount);
+ temp2 = xfs_bmap_worst_indlen(ip, new.br_blockcount);
+ stolen = xfs_bmap_split_indlen(da_old, &temp, &temp2,
+ del->br_blockcount);
+ da_new = temp + temp2 - stolen;
+ del->br_blockcount -= stolen;
+
+ /*
+ * Set the reservation for each extent. Warn if either
+ * is zero as this can lead to delalloc problems.
+ */
+ WARN_ON_ONCE(!temp || !temp2);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- temp2 = xfs_bmap_worst_indlen(ip, temp2);
new.br_startblock = nullstartblock((int)temp2);
- da_new = temp + temp2;
- while (da_new > da_old) {
- if (temp) {
- temp--;
- da_new--;
- xfs_bmbt_set_startblock(ep,
- nullstartblock((int)temp));
- }
- if (da_new == da_old)
- break;
- if (temp2) {
- temp2--;
- da_new--;
- new.br_startblock =
- nullstartblock((int)temp2);
- }
- }
}
trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
xfs_iext_insert(ip, *idx + 1, 1, &new, state);
@@ -5070,7 +5167,7 @@ xfs_bunmapi(
*done = 1;
return 0;
}
- XFS_STATS_INC(xs_blk_unmap);
+ XFS_STATS_INC(mp, xs_blk_unmap);
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
start = bno;
bno = start + len - 1;
@@ -5171,7 +5268,7 @@ xfs_bunmapi(
* This is better than zeroing it.
*/
ASSERT(del.br_state == XFS_EXT_NORM);
- ASSERT(xfs_trans_get_block_res(tp) > 0);
+ ASSERT(tp->t_blk_res > 0);
/*
* If this spans a realtime extent boundary,
* chop it back to the start of the one we end at.
@@ -5202,7 +5299,7 @@ xfs_bunmapi(
del.br_startblock += mod;
} else if ((del.br_startoff == start &&
(del.br_state == XFS_EXT_UNWRITTEN ||
- xfs_trans_get_block_res(tp) == 0)) ||
+ tp->t_blk_res == 0)) ||
!xfs_sb_version_hasextflgbit(&mp->m_sb)) {
/*
* Can't make it unwritten. There isn't
@@ -5257,9 +5354,37 @@ xfs_bunmapi(
goto nodelete;
}
}
+
+ /*
+ * If it's the case where the directory code is running
+ * with no block reservation, and the deleted block is in
+ * the middle of its extent, and the resulting insert
+ * of an extent would cause transformation to btree format,
+ * then reject it. The calling code will then swap
+ * blocks around instead.
+ * We have to do this now, rather than waiting for the
+ * conversion to btree format, since the transaction
+ * will be dirty.
+ */
+ if (!wasdel && tp->t_blk_res == 0 &&
+ XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
+ del.br_startoff > got.br_startoff &&
+ del.br_startoff + del.br_blockcount <
+ got.br_startoff + got.br_blockcount) {
+ error = -ENOSPC;
+ goto error0;
+ }
+
+ /*
+ * Unreserve quota and update realtime free space, if
+ * appropriate. If delayed allocation, update the inode delalloc
+ * counter now and wait to update the sb counters as
+ * xfs_bmap_del_extent() might need to borrow some blocks.
+ */
if (wasdel) {
ASSERT(startblockval(del.br_startblock) > 0);
- /* Update realtime/data freespace, unreserve quota */
if (isrt) {
xfs_filblks_t rtexts;
@@ -5270,8 +5395,6 @@ xfs_bunmapi(
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_RTBLKS);
} else {
- xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
- false);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_REGBLKS);
@@ -5282,32 +5405,16 @@ xfs_bunmapi(
XFS_BTCUR_BPRV_WASDEL;
} else if (cur)
cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
- /*
- * If it's the case where the directory code is running
- * with no block reservation, and the deleted block is in
- * the middle of its extent, and the resulting insert
- * of an extent would cause transformation to btree format,
- * then reject it. The calling code will then swap
- * blocks around instead.
- * We have to do this now, rather than waiting for the
- * conversion to btree format, since the transaction
- * will be dirty.
- */
- if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
- XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
- XFS_IFORK_MAXEXT(ip, whichfork) &&
- del.br_startoff > got.br_startoff &&
- del.br_startoff + del.br_blockcount <
- got.br_startoff + got.br_blockcount) {
- error = -ENOSPC;
- goto error0;
- }
+
error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
+
+ if (!isrt && wasdel)
+ xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
+
bno = del.br_startoff - 1;
nodelete:
/*
@@ -5917,7 +6024,6 @@ xfs_bmap_split_extent(
struct xfs_trans *tp;
struct xfs_bmap_free free_list;
xfs_fsblock_t firstfsb;
- int committed;
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
@@ -5938,7 +6044,7 @@ xfs_bmap_split_extent(
if (error)
goto out;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 6aaa0c1c7200..423a34e832bd 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
xfs_extlen_t minleft; /* amount must be left after alloc */
bool eof; /* set if allocating past last extent */
bool wasdel; /* replacing a delayed allocation */
- bool userdata;/* set if is user data */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
+ char userdata;/* userdata mask */
int flags;
};
@@ -109,6 +109,14 @@ typedef struct xfs_bmap_free
*/
#define XFS_BMAPI_CONVERT 0x040
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO 0x080
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -116,7 +124,8 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_ZERO, "ZERO" }
static inline int xfs_bmapi_aflag(int w)
@@ -186,7 +195,7 @@ void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
struct xfs_bmap_free *flist, struct xfs_mount *mp);
void xfs_bmap_cancel(struct xfs_bmap_free *flist);
int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
- int *committed);
+ struct xfs_inode *ip);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6b0cf6546a82..6282f6e708af 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -461,7 +461,7 @@ xfs_bmbt_alloc_block(
* reservation amount is insufficient then we may fail a
* block allocation here and corrupt the filesystem.
*/
- args.minleft = xfs_trans_get_block_res(args.tp);
+ args.minleft = args.tp->t_blk_res;
} else if (cur->bc_private.b.flist->xbf_low) {
args.type = XFS_ALLOCTYPE_START_BNO;
} else {
@@ -470,7 +470,7 @@ xfs_bmbt_alloc_block(
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
- if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+ if (!args.wasdel && args.tp->t_blk_res == 0) {
error = -ENOSPC;
goto error0;
}
@@ -531,7 +531,6 @@ xfs_bmbt_free_block(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(tp, bp);
return 0;
}
@@ -720,6 +719,7 @@ xfs_bmbt_write_verify(
}
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .name = "xfs_bmbt",
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f7d7ee7a2607..1f88e1ce770f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -32,6 +32,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_alloc.h"
+#include "xfs_log.h"
/*
* Cursor allocation zone.
@@ -222,7 +223,7 @@ xfs_btree_check_ptr(
* long-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -243,8 +244,14 @@ bool
xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+ }
return true;
}
@@ -254,7 +261,7 @@ xfs_btree_lblock_verify_crc(
* short-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -275,12 +282,33 @@ bool
xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+ }
return true;
}
+static int
+xfs_btree_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ int error;
+
+ error = cur->bc_ops->free_block(cur, bp);
+ if (!error) {
+ xfs_trans_binval(cur->bc_tp, bp);
+ XFS_BTREE_STATS_INC(cur, free);
+ }
+ return error;
+}
+
/*
* Delete the btree cursor.
*/
@@ -3196,6 +3224,7 @@ xfs_btree_kill_iroot(
int level;
int index;
int numrecs;
+ int error;
#ifdef DEBUG
union xfs_btree_ptr ptr;
int i;
@@ -3259,8 +3288,6 @@ xfs_btree_kill_iroot(
cpp = xfs_btree_ptr_addr(cur, 1, cblock);
#ifdef DEBUG
for (i = 0; i < numrecs; i++) {
- int error;
-
error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
if (error) {
XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
@@ -3270,8 +3297,11 @@ xfs_btree_kill_iroot(
#endif
xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
- cur->bc_ops->free_block(cur, cbp);
- XFS_BTREE_STATS_INC(cur, free);
+ error = xfs_btree_free_block(cur, cbp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
cur->bc_bufs[level - 1] = NULL;
be16_add_cpu(&block->bb_level, -1);
@@ -3304,14 +3334,12 @@ xfs_btree_kill_root(
*/
cur->bc_ops->set_root(cur, newroot, -1);
- error = cur->bc_ops->free_block(cur, bp);
+ error = xfs_btree_free_block(cur, bp);
if (error) {
XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
- XFS_BTREE_STATS_INC(cur, free);
-
cur->bc_bufs[level] = NULL;
cur->bc_ra[level] = 0;
cur->bc_nlevels--;
@@ -3817,10 +3845,9 @@ xfs_btree_delrec(
}
/* Free the deleted block. */
- error = cur->bc_ops->free_block(cur, rbp);
+ error = xfs_btree_free_block(cur, rbp);
if (error)
goto error0;
- XFS_BTREE_STATS_INC(cur, free);
/*
* If we joined with the left neighbor, set the buffer in the
@@ -4067,3 +4094,61 @@ xfs_btree_change_owner(
return 0;
}
+
+/**
+ * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * btree block
+ *
+ * @bp: buffer containing the btree block
+ *
+ * Return: true if the filesystem has CRCs and the block's v5 header is valid.
+ */
+bool
+xfs_btree_sblock_v5hdr_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return false;
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+ return false;
+ if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ return false;
+ return true;
+}
+
+/**
+ * xfs_btree_sblock_verify() -- verify a short-format btree block
+ *
+ * @bp: buffer containing the btree block
+ * @max_recs: maximum records allowed in this btree node
+ */
+bool
+xfs_btree_sblock_verify(
+ struct xfs_buf *bp,
+ unsigned int max_recs)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ /* numrecs verification */
+ if (be16_to_cpu(block->bb_numrecs) > max_recs)
+ return false;
+
+ /* sibling pointer verification */
+ if (!block->bb_u.s.bb_leftsib ||
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+ if (!block->bb_u.s.bb_rightsib ||
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+ block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+ return false;
+
+ return true;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 8f18bab73ea5..2e874be70209 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -84,31 +84,38 @@ union xfs_btree_rec {
/*
* Generic stats interface
*/
-#define __XFS_BTREE_STATS_INC(type, stat) \
- XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
-#define XFS_BTREE_STATS_INC(cur, stat) \
+#define __XFS_BTREE_STATS_INC(mp, type, stat) \
+ XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
-#define __XFS_BTREE_STATS_ADD(type, stat, val) \
- XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
+ XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ case XFS_BTNUM_BNO: \
+ __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: \
+ __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: \
+ __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: \
+ __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: \
+ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -465,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
#define XFS_BTREE_TRACE_ARGR(c, r)
#define XFS_BTREE_TRACE_CURSOR(c, t)
+bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
+bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index be43248a5822..097bf7717d80 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -39,6 +39,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
* xfs_da_btree.c
@@ -150,6 +151,8 @@ xfs_da3_node_verify(
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_DA_NODE_MAGIC)
return false;
@@ -242,6 +245,7 @@ xfs_da3_node_read_verify(
}
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+ .name = "xfs_da3_node",
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
};
@@ -322,6 +326,7 @@ xfs_da3_node_create(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(bp->b_bn);
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index b14bbd6bb05f..8d4d8bce41bf 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -641,6 +641,22 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
*/
#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+/*
+ * Entries are packed toward the top as tight as possible.
+ */
+typedef struct xfs_attr_shortform {
+ struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ } hdr;
+ struct xfs_attr_sf_entry {
+ __uint8_t namelen; /* actual length of name (no NULL) */
+ __uint8_t valuelen; /* actual length of value (no NULL) */
+ __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
+ __uint8_t nameval[1]; /* name & value bytes concatenated */
+ } list[1]; /* variable sized array */
+} xfs_attr_shortform_t;
+
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
__be16 base; /* base of free region */
__be16 size; /* length of free region */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 9de401d297e5..af0f9d171f8a 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -176,7 +176,7 @@ xfs_dir_isempty(
{
xfs_dir2_sf_hdr_t *sfp;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (dp->i_d.di_size == 0) /* might happen during shutdown. */
return 1;
if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
@@ -231,7 +231,7 @@ xfs_dir_init(
struct xfs_da_args *args;
int error;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
if (error)
return error;
@@ -266,12 +266,12 @@ xfs_dir_createname(
int rval;
int v; /* type-checking value */
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
if (inum) {
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
return rval;
- XFS_STATS_INC(xs_dir_create);
+ XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -364,8 +364,8 @@ xfs_dir_lookup(
int v; /* type-checking value */
int lock_mode;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_lookup);
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
/*
* We need to use KM_NOFS here so that lockdep will not throw false
@@ -443,8 +443,8 @@ xfs_dir_removename(
int rval;
int v; /* type-checking value */
- ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_remove);
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ XFS_STATS_INC(dp->i_mount, xs_dir_remove);
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
@@ -505,7 +505,7 @@ xfs_dir_replace(
int rval;
int v; /* type-checking value */
- ASSERT(S_ISDIR(dp->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 4778d1dd511a..aa17cb788946 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function prototypes.
@@ -71,6 +72,8 @@ xfs_dir3_block_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
return false;
@@ -120,6 +123,7 @@ xfs_dir3_block_write_verify(
}
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+ .name = "xfs_dir3_block",
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 824131e71bc5..725fc7841fde 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -31,6 +31,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Check the consistency of the data block.
@@ -224,6 +225,8 @@ xfs_dir3_data_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
return false;
@@ -302,11 +305,13 @@ xfs_dir3_data_write_verify(
}
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+ .name = "xfs_dir3_data",
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
};
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+ .name = "xfs_dir3_data_reada",
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index f300240ebb8d..b887fb2a2bcf 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function declarations.
@@ -164,6 +165,8 @@ xfs_dir3_leaf_verify(
return false;
if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
+ return false;
} else {
if (leaf->hdr.info.magic != cpu_to_be16(magic))
return false;
@@ -242,11 +245,13 @@ xfs_dir3_leafn_write_verify(
}
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+ .name = "xfs_dir3_leaf1",
.verify_read = xfs_dir3_leaf1_read_verify,
.verify_write = xfs_dir3_leaf1_write_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+ .name = "xfs_dir3_leafn",
.verify_read = xfs_dir3_leafn_read_verify,
.verify_write = xfs_dir3_leafn_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index cc28e924545b..75a557432d0f 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Function declarations.
@@ -97,6 +98,8 @@ xfs_dir3_free_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
return false;
@@ -147,6 +150,7 @@ xfs_dir3_free_write_verify(
}
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+ .name = "xfs_dir3_free",
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
};
@@ -2231,6 +2235,9 @@ xfs_dir2_node_trim_free(
dp = args->dp;
tp = args->trans;
+
+ *rvalp = 0;
+
/*
* Read the freespace block.
*/
@@ -2251,7 +2258,6 @@ xfs_dir2_node_trim_free(
*/
if (freehdr.nused > 0) {
xfs_trans_brelse(tp, bp);
- *rvalp = 0;
return 0;
}
/*
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 5331b7f0460c..3cc3cf767474 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -54,7 +54,7 @@ xfs_dqcheck(
xfs_dqid_t id,
uint type, /* used only when IO_dorepair is true */
uint flags,
- char *str)
+ const char *str)
{
xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
int errs = 0;
@@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
STATIC bool
xfs_dquot_buf_verify(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ int warn)
{
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
xfs_dqid_t id = 0;
@@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
- "xfs_dquot_buf_verify");
+ error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
if (error)
return false;
}
@@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
if (!xfs_dquot_buf_verify_crc(mp, bp))
xfs_buf_ioerror(bp, -EFSBADCRC);
- else if (!xfs_dquot_buf_verify(mp, bp))
+ else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
xfs_buf_ioerror(bp, -EFSCORRUPTED);
if (bp->b_error)
@@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
}
/*
+ * readahead errors are silent and simply leave the buffer as !done so a real
+ * read will then be run with the xfs_dquot_buf_ops verifier. See
+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
+ * reporting the failure.
+ */
+static void
+xfs_dquot_buf_readahead_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp) ||
+ !xfs_dquot_buf_verify(mp, bp, 0)) {
+ xfs_buf_ioerror(bp, -EIO);
+ bp->b_flags &= ~XBF_DONE;
+ }
+}
+
+/*
* we don't calculate the CRC here as that is done when the dquot is flushed to
* the buffer after the update is done. This ensures that the dquot in the
* buffer always has an up-to-date CRC value.
@@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if (!xfs_dquot_buf_verify(mp, bp)) {
+ if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
xfs_buf_ioerror(bp, -EFSCORRUPTED);
xfs_verifier_error(bp);
return;
@@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify(
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .name = "xfs_dquot",
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
+ .name = "xfs_dquot_ra",
+ .verify_read = xfs_dquot_buf_readahead_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9590a069e556..dc97eb21af07 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -60,6 +60,14 @@ struct xfs_ifork;
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
/*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * sizes up to 2^16 bytes in length.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
* Supported feature bit list is just all bits in the versionnum field because
* we've used them all up and understand them all. Except, of course, for the
* shared superblock bit, which nobody knows what it does and so is unsupported.
@@ -778,7 +786,7 @@ typedef struct xfs_agfl {
__be64 agfl_lsn;
__be32 agfl_crc;
__be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
-} xfs_agfl_t;
+} __attribute__((packed)) xfs_agfl_t;
#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
@@ -976,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
/*
* Values for di_flags
- * There should be a one-to-one correspondence between these flags and the
- * XFS_XFLAG_s.
*/
#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
@@ -1018,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
/*
+ * Values for di_flags2. These start by being exposed to userspace in the upper
+ * 16 bits of the XFS_XFLAG_s range.
+ */
+#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */
+#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
+
+#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX)
+
+/*
* Inode number format:
* low inopblog bits - offset in block
* next agblklog bits - block number in ag
@@ -1483,13 +1498,17 @@ struct xfs_acl {
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
(xfs_sb_version_hascrc(&mp->m_sb) \
- ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
-#define XFS_ACL_MAX_SIZE(mp) \
+#define XFS_ACL_SIZE(cnt) \
(sizeof(struct xfs_acl) + \
- sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+ sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE "SGI_ACL_FILE"
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 89689c6a43e2..fffe3d01bd9f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -36,40 +36,6 @@ struct dioattr {
#endif
/*
- * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
- */
-#ifndef HAVE_FSXATTR
-struct fsxattr {
- __u32 fsx_xflags; /* xflags field value (get/set) */
- __u32 fsx_extsize; /* extsize field value (get/set)*/
- __u32 fsx_nextents; /* nextents field value (get) */
- __u32 fsx_projid; /* project identifier (get/set) */
- unsigned char fsx_pad[12];
-};
-#endif
-
-/*
- * Flags for the bs_xflags/fsx_xflags field
- * There should be a one-to-one correspondence between these flags and the
- * XFS_DIFLAG_s.
- */
-#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
-#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
-#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
-#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */
-#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
-#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */
-#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
-#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
-#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
-#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
-#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */
-#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */
-#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */
-#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */
-#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
-
-/*
* Structure for XFS_IOC_GETBMAP.
* On input, fill in bmv_offset and bmv_length of the first structure
* to indicate the area of interest in the file, and bmv_entries with
@@ -490,6 +456,16 @@ typedef struct xfs_swapext
#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
/*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+# define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
* ioctl commands that are used by Linux filesystems
*/
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
@@ -504,8 +480,8 @@ typedef struct xfs_swapext
#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
-#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr)
-#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 54deb2d12ac6..22297f9b0fd5 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
/*
@@ -2402,8 +2403,8 @@ xfs_ialloc_compute_maxlevels(
maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
XFS_INODES_PER_CHUNK_LOG;
- minleafrecs = mp->m_alloc_mnr[0];
- minnoderecs = mp->m_alloc_mnr[1];
+ minleafrecs = mp->m_inobt_mnr[0];
+ minnoderecs = mp->m_inobt_mnr[1];
maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
for (level = 1; maxblocks > 1; level++)
maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
@@ -2500,9 +2501,14 @@ xfs_agi_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
return false;
+ }
+
/*
* Validate the magic number of the agi block.
*/
@@ -2566,6 +2572,7 @@ xfs_agi_write_verify(
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .name = "xfs_agi",
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index f39b285beb19..89c21d771e35 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -125,16 +125,8 @@ xfs_inobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- xfs_fsblock_t fsbno;
- int error;
-
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
- error = xfs_free_extent(cur->bc_tp, fsbno, 1);
- if (error)
- return error;
-
- xfs_trans_binval(cur->bc_tp, bp);
- return error;
+ return xfs_free_extent(cur->bc_tp,
+ XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1);
}
STATIC int
@@ -221,7 +213,6 @@ xfs_inobt_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- struct xfs_perag *pag = bp->b_pag;
unsigned int level;
/*
@@ -237,14 +228,7 @@ xfs_inobt_verify(
switch (block->bb_magic) {
case cpu_to_be32(XFS_IBT_CRC_MAGIC):
case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- return false;
- if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
- return false;
- if (pag &&
- be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+ if (!xfs_btree_sblock_v5hdr_verify(bp))
return false;
/* fall through */
case cpu_to_be32(XFS_IBT_MAGIC):
@@ -254,24 +238,12 @@ xfs_inobt_verify(
return 0;
}
- /* numrecs and level verification */
+ /* level verification */
level = be16_to_cpu(block->bb_level);
if (level >= mp->m_in_maxlevels)
return false;
- if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
- return false;
-
- /* sibling pointer verification */
- if (!block->bb_u.s.bb_leftsib ||
- (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- if (!block->bb_u.s.bb_rightsib ||
- (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
- block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
- return false;
- return true;
+ return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]);
}
static void
@@ -304,6 +276,7 @@ xfs_inobt_write_verify(
}
const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .name = "xfs_inobt",
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 268c00f4f83a..9d9559eb2835 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -62,11 +62,14 @@ xfs_inobp_check(
* has not had the inode cores stamped into it. Hence for readahead, the buffer
* may be potentially invalid.
*
- * If the readahead buffer is invalid, we don't want to mark it with an error,
- * but we do want to clear the DONE status of the buffer so that a followup read
- * will re-read it from disk. This will ensure that we don't get an unnecessary
- * warnings during log recovery and we don't get unnecssary panics on debug
- * kernels.
+ * If the readahead buffer is invalid, we need to mark it with an error and
+ * clear the DONE status of the buffer so that a followup read will re-read it
+ * from disk. We don't report the error otherwise to avoid warnings during log
+ * recovery and unnecessary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behaviour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
*/
static void
xfs_inode_buf_verify(
@@ -93,6 +96,7 @@ xfs_inode_buf_verify(
XFS_RANDOM_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
+ xfs_buf_ioerror(bp, -EIO);
return;
}
@@ -132,11 +136,13 @@ xfs_inode_buf_write_verify(
}
const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .name = "xfs_inode",
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+ .name = "xfs_inode_ra",
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
@@ -189,28 +195,50 @@ xfs_imap_to_bp(
}
void
-xfs_dinode_from_disk(
- xfs_icdinode_t *to,
- xfs_dinode_t *from)
+xfs_inode_from_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *from)
{
- to->di_magic = be16_to_cpu(from->di_magic);
- to->di_mode = be16_to_cpu(from->di_mode);
- to->di_version = from ->di_version;
+ struct xfs_icdinode *to = &ip->i_d;
+ struct inode *inode = VFS_I(ip);
+
+
+ /*
+ * Convert v1 inodes immediately to v2 inode format as this is the
+ * minimum inode version format we support in the rest of the code.
+ */
+ to->di_version = from->di_version;
+ if (to->di_version == 1) {
+ set_nlink(inode, be16_to_cpu(from->di_onlink));
+ to->di_projid_lo = 0;
+ to->di_projid_hi = 0;
+ to->di_version = 2;
+ } else {
+ set_nlink(inode, be32_to_cpu(from->di_nlink));
+ to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
+ to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+ }
+
to->di_format = from->di_format;
- to->di_onlink = be16_to_cpu(from->di_onlink);
to->di_uid = be32_to_cpu(from->di_uid);
to->di_gid = be32_to_cpu(from->di_gid);
- to->di_nlink = be32_to_cpu(from->di_nlink);
- to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
- to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
to->di_flushiter = be16_to_cpu(from->di_flushiter);
- to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
- to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
- to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
- to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
- to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
- to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
+
+ /*
+ * Time is signed, so need to convert to signed 32 bit before
+ * storing in inode timestamp which may be 64 bit. Otherwise
+ * a time before epoch is converted to a time long after epoch
+ * on 64 bit systems.
+ */
+ inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
+ inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
+ inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
+ inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
+ inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
+ inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
+ inode->i_generation = be32_to_cpu(from->di_gen);
+ inode->i_mode = be16_to_cpu(from->di_mode);
+
to->di_size = be64_to_cpu(from->di_size);
to->di_nblocks = be64_to_cpu(from->di_nblocks);
to->di_extsize = be32_to_cpu(from->di_extsize);
@@ -221,42 +249,96 @@ xfs_dinode_from_disk(
to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
to->di_dmstate = be16_to_cpu(from->di_dmstate);
to->di_flags = be16_to_cpu(from->di_flags);
- to->di_gen = be32_to_cpu(from->di_gen);
if (to->di_version == 3) {
- to->di_changecount = be64_to_cpu(from->di_changecount);
+ inode->i_version = be64_to_cpu(from->di_changecount);
to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
to->di_flags2 = be64_to_cpu(from->di_flags2);
- to->di_ino = be64_to_cpu(from->di_ino);
- to->di_lsn = be64_to_cpu(from->di_lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
- uuid_copy(&to->di_uuid, &from->di_uuid);
}
}
void
-xfs_dinode_to_disk(
- xfs_dinode_t *to,
- xfs_icdinode_t *from)
+xfs_inode_to_disk(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icdinode *from = &ip->i_d;
+ struct inode *inode = VFS_I(ip);
+
+ to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ to->di_onlink = 0;
+
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_uid = cpu_to_be32(from->di_uid);
+ to->di_gid = cpu_to_be32(from->di_gid);
+ to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
+ to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+
+ memset(to->di_pad, 0, sizeof(to->di_pad));
+ to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
+ to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
+ to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
+ to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
+ to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
+ to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ to->di_nlink = cpu_to_be32(inode->i_nlink);
+ to->di_gen = cpu_to_be32(inode->i_generation);
+ to->di_mode = cpu_to_be16(inode->i_mode);
+
+ to->di_size = cpu_to_be64(from->di_size);
+ to->di_nblocks = cpu_to_be64(from->di_nblocks);
+ to->di_extsize = cpu_to_be32(from->di_extsize);
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
+ to->di_dmstate = cpu_to_be16(from->di_dmstate);
+ to->di_flags = cpu_to_be16(from->di_flags);
+
+ if (from->di_version == 3) {
+ to->di_changecount = cpu_to_be64(inode->i_version);
+ to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
+ to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
+ to->di_flags2 = cpu_to_be64(from->di_flags2);
+
+ to->di_ino = cpu_to_be64(ip->i_ino);
+ to->di_lsn = cpu_to_be64(lsn);
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ }
+}
+
+void
+xfs_log_dinode_to_disk(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
{
to->di_magic = cpu_to_be16(from->di_magic);
to->di_mode = cpu_to_be16(from->di_mode);
- to->di_version = from ->di_version;
+ to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_onlink = cpu_to_be16(from->di_onlink);
+ to->di_onlink = 0;
to->di_uid = cpu_to_be32(from->di_uid);
to->di_gid = cpu_to_be32(from->di_gid);
to->di_nlink = cpu_to_be32(from->di_nlink);
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
+
to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
+
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
@@ -361,13 +443,10 @@ xfs_iread(
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
/* initialise the on-disk inode core */
memset(&ip->i_d, 0, sizeof(ip->i_d));
- ip->i_d.di_magic = XFS_DINODE_MAGIC;
- ip->i_d.di_gen = prandom_u32();
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ VFS_I(ip)->i_generation = prandom_u32();
+ if (xfs_sb_version_hascrc(&mp->m_sb))
ip->i_d.di_version = 3;
- ip->i_d.di_ino = ip->i_ino;
- uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
- } else
+ else
ip->i_d.di_version = 2;
return 0;
}
@@ -397,7 +476,7 @@ xfs_iread(
* Otherwise, just get the truly permanent information.
*/
if (dip->di_mode) {
- xfs_dinode_from_disk(&ip->i_d, dip);
+ xfs_inode_from_disk(ip, dip);
error = xfs_iformat_fork(ip, dip);
if (error) {
#ifdef DEBUG
@@ -411,16 +490,10 @@ xfs_iread(
* Partial initialisation of the in-core inode. Just the bits
* that xfs_ialloc won't overwrite or relies on being correct.
*/
- ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
ip->i_d.di_version = dip->di_version;
- ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+ VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
- if (dip->di_version == 3) {
- ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
- uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
- }
-
/*
* Make sure to pull in the mode here as well in
* case the inode is released without being used.
@@ -428,25 +501,10 @@ xfs_iread(
* the inode is already free and not try to mess
* with the uninitialized part of it.
*/
- ip->i_d.di_mode = 0;
- }
-
- /*
- * Automatically convert version 1 inode formats in memory to version 2
- * inode format. If the inode is modified, it will get logged and
- * rewritten as a version 2 inode. We can do this because we set the
- * superblock feature bit for v2 inodes unconditionally during mount
- * and it means the reast of the code can assume the inode version is 2
- * or higher.
- */
- if (ip->i_d.di_version == 1) {
- ip->i_d.di_version = 2;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- ip->i_d.di_nlink = ip->i_d.di_onlink;
- ip->i_d.di_onlink = 0;
- xfs_set_projid(ip, 0);
+ VFS_I(ip)->i_mode = 0;
}
+ ASSERT(ip->i_d.di_version >= 2);
ip->i_delayed_blks = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..7c4dd321b215 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -20,7 +20,36 @@
struct xfs_inode;
struct xfs_dinode;
-struct xfs_icdinode;
+
+/*
+ * In memory representation of the XFS inode. This is held in the in-core struct
+ * xfs_inode and represents the current on disk values but the structure is not
+ * in on-disk format. That is, this structure is always translated to on-disk
+ * format specific structures at the appropriate time.
+ */
+struct xfs_icdinode {
+ __int8_t di_version; /* inode version */
+ __int8_t di_format; /* format of di_c data */
+ __uint16_t di_flushiter; /* incremented on flush */
+ __uint32_t di_uid; /* owner's user id */
+ __uint32_t di_gid; /* owner's group id */
+ __uint16_t di_projid_lo; /* lower part of owner's project id */
+ __uint16_t di_projid_hi; /* higher part of owner's project id */
+ xfs_fsize_t di_size; /* number of bytes in file */
+ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
+ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
+ xfs_extnum_t di_nextents; /* number of extents in data fork */
+ xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
+ __int8_t di_aformat; /* format of attr fork's data */
+ __uint32_t di_dmevmask; /* DMIG event mask */
+ __uint16_t di_dmstate; /* DMIG state info */
+ __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
+
+ __uint64_t di_flags2; /* more random flags */
+
+ xfs_ictimestamp_t di_crtime; /* time created */
+};
/*
* Inode location information. Stored in the inode and passed to
@@ -38,8 +67,11 @@ int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
int xfs_iread(struct xfs_mount *, struct xfs_trans *,
struct xfs_inode *, uint);
void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
-void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
+void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to,
+ xfs_lsn_t lsn);
+void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
+void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
+ struct xfs_dinode *to);
#if defined(DEBUG)
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 0defbd02f62d..11faf7df14c8 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -31,6 +31,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_attr_sf.h"
+#include "xfs_da_format.h"
kmem_zone_t *xfs_ifork_zone;
@@ -120,7 +121,7 @@ xfs_iformat_fork(
return -EFSCORRUPTED;
}
- switch (ip->i_d.di_mode & S_IFMT) {
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFIFO:
case S_IFCHR:
case S_IFBLK:
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 265314690415..d54a8018b079 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -290,6 +290,7 @@ typedef struct xfs_inode_log_format_64 {
__int32_t ilf_boffset; /* off of inode in buffer */
} xfs_inode_log_format_64_t;
+
/*
* Flags for xfs_trans_log_inode flags field.
*/
@@ -360,15 +361,15 @@ typedef struct xfs_ictimestamp {
} xfs_ictimestamp_t;
/*
- * NOTE: This structure must be kept identical to struct xfs_dinode
- * except for the endianness annotations.
+ * Define the format of the inode core that is logged. This structure must be
+ * kept identical to struct xfs_dinode except for the endianness annotations.
*/
-typedef struct xfs_icdinode {
+struct xfs_log_dinode {
__uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__uint16_t di_mode; /* mode and type of file */
__int8_t di_version; /* inode version */
__int8_t di_format; /* format of di_c data */
- __uint16_t di_onlink; /* old number of links to file */
+ __uint8_t di_pad3[2]; /* unused in v2/3 inodes */
__uint32_t di_uid; /* owner's user id */
__uint32_t di_gid; /* owner's group id */
__uint32_t di_nlink; /* number of links to file */
@@ -407,13 +408,13 @@ typedef struct xfs_icdinode {
uuid_t di_uuid; /* UUID of the filesystem */
/* structure must be padded to 64 bit alignment */
-} xfs_icdinode_t;
+};
-static inline uint xfs_icdinode_size(int version)
+static inline uint xfs_log_dinode_size(int version)
{
if (version == 3)
- return sizeof(struct xfs_icdinode);
- return offsetof(struct xfs_icdinode, di_next_unlinked);
+ return sizeof(struct xfs_log_dinode);
+ return offsetof(struct xfs_log_dinode, di_next_unlinked);
}
/*
@@ -495,6 +496,8 @@ enum xfs_blft {
XFS_BLFT_ATTR_LEAF_BUF,
XFS_BLFT_ATTR_RMT_BUF,
XFS_BLFT_SB_BUF,
+ XFS_BLFT_RTBITMAP_BUF,
+ XFS_BLFT_RTSUMMARY_BUF,
XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
};
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccbb379d..8e385f91d660 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -60,6 +60,7 @@ typedef struct xlog_recover {
*/
#define XLOG_BC_TABLE_SIZE 64
+#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
#define XLOG_RECOVER_PASS2 2
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 1b0a08379759..8eed51275bb3 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -37,7 +37,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_DQ_PROJ 0x0002 /* project quota */
#define XFS_DQ_GROUP 0x0004 /* a group quota */
#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
-#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
+#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */
#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
@@ -116,6 +116,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
+#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
@@ -153,7 +154,7 @@ typedef __uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
- xfs_dqid_t id, uint type, uint flags, char *str);
+ xfs_dqid_t id, uint type, uint flags, const char *str);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 9b59ffa1fc19..951c044e24e4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -42,6 +42,31 @@
*/
/*
+ * Real time buffers need verifiers to avoid runtime warnings during IO.
+ * We don't have anything to verify, however, so these are just dummy
+ * operations.
+ */
+static void
+xfs_rtbuf_verify_read(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+static void
+xfs_rtbuf_verify_write(
+ struct xfs_buf *bp)
+{
+ return;
+}
+
+const struct xfs_buf_ops xfs_rtbuf_ops = {
+ .name = "rtbuf",
+ .verify_read = xfs_rtbuf_verify_read,
+ .verify_write = xfs_rtbuf_verify_write,
+};
+
+/*
* Get a buffer for the bitmap or summary file block specified.
* The buffer is returned read and locked.
*/
@@ -68,9 +93,12 @@ xfs_rtbuf_get(
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp, NULL);
+ mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
if (error)
return error;
+
+ xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF
+ : XFS_BLFT_RTBITMAP_BUF);
*bpp = bp;
return 0;
}
@@ -983,7 +1011,7 @@ xfs_rtfree_extent(
mp->m_sb.sb_rextents) {
if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
- *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+ *(__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0;
xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
}
return 0;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 47425140f343..8a53eaa349f4 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -35,6 +35,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_log.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -163,6 +164,15 @@ xfs_mount_validate_sb(
"Filesystem can not be safely mounted by this kernel.");
return -EINVAL;
}
+ } else if (xfs_sb_version_hascrc(sbp)) {
+ /*
+ * We can't read verify the sb LSN because the read verifier is
+ * called before the log is allocated and processed. We know the
+ * log is set up before write verifier (!check_version) calls,
+ * so just check it here.
+ */
+ if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+ return -EFSCORRUPTED;
}
if (xfs_sb_version_has_pquotino(sbp)) {
@@ -669,11 +679,13 @@ xfs_sb_write_verify(
}
const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .name = "xfs_sb",
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .name = "xfs_sb_quiet",
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index b25bb9a343f3..961e6475a309 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,7 +27,6 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
extern void xfs_perag_put(struct xfs_perag *pag);
extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
-extern void xfs_sb_calc_crc(struct xfs_buf *bp);
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 5be529707903..81ac870834da 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -49,9 +49,11 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
extern const struct xfs_buf_ops xfs_sb_buf_ops;
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+extern const struct xfs_buf_ops xfs_rtbuf_ops;
/*
* Transaction types. Used to distinguish types of buffers. These never reach
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 8f8af05b3f13..2e2c6716b623 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -31,6 +31,7 @@
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
@@ -60,6 +61,7 @@ xfs_symlink_hdr_set(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return 0;
+ memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
dsl->sl_offset = cpu_to_be32(offset);
dsl->sl_bytes = cpu_to_be32(size);
@@ -116,6 +118,8 @@ xfs_symlink_verify(
return false;
if (dsl->sl_owner == 0)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+ return false;
return true;
}
@@ -164,6 +168,7 @@ xfs_symlink_write_verify(
}
const struct xfs_buf_ops xfs_symlink_buf_ops = {
+ .name = "xfs_symlink",
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
};
@@ -183,6 +188,7 @@ xfs_symlink_local_to_remote(
if (!xfs_sb_version_hascrc(&mp->m_sb)) {
bp->b_ops = NULL;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -198,4 +204,6 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+ ifp->if_bytes - 1);
}
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4b641676f258..2d5df1f23bbc 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,16 +37,19 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
- struct xfs_acl *aclp,
- int max_entries)
+ const struct xfs_acl *aclp,
+ int len,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
- struct xfs_acl_entry *ace;
+ const struct xfs_acl_entry *ace;
unsigned int count, i;
+ if (len < sizeof(*aclp))
+ return ERR_PTR(-EFSCORRUPTED);
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -160,10 +163,11 @@ xfs_get_acl(struct inode *inode, int type)
*/
if (error == -ENOATTR)
goto out_update_cache;
+ acl = ERR_PTR(error);
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
@@ -248,29 +252,6 @@ xfs_set_mode(struct inode *inode, umode_t mode)
return error;
}
-static int
-xfs_acl_exists(struct inode *inode, unsigned char *name)
-{
- int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
-
- return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
- ATTR_ROOT|ATTR_KERNOVAL) == 0);
-}
-
-int
-posix_acl_access_exists(struct inode *inode)
-{
- return xfs_acl_exists(inode, SGI_ACL_FILE);
-}
-
-int
-posix_acl_default_exists(struct inode *inode)
-{
- if (!S_ISDIR(inode->i_mode))
- return 0;
- return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
-}
-
int
xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 3841b07f27bf..286fa89217f5 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -20,20 +20,18 @@
struct inode;
struct posix_acl;
-struct xfs_inode;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int posix_acl_access_exists(struct inode *inode);
-extern int posix_acl_default_exists(struct inode *inode);
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
# define xfs_set_acl NULL
-# define posix_acl_access_exists(inode) 0
-# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
+
+extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
+
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 50ab2879b9da..e49b2406d15d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,21 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
+/* flags for direct write completions */
+#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
+#define XFS_DIO_FLAG_APPEND (1 << 1)
+
+/*
+ * structure owned by writepages passed to individual writepage calls
+ */
+struct xfs_writepage_ctx {
+ struct xfs_bmbt_irec imap;
+ bool imap_valid;
+ unsigned int io_type;
+ struct xfs_ioend *ioend;
+ sector_t last_block;
+};
+
void
xfs_count_page_state(
struct page *page,
@@ -55,7 +70,7 @@ xfs_count_page_state(
} while ((bh = bh->b_this_page) != head);
}
-STATIC struct block_device *
+struct block_device *
xfs_find_bdev_for_inode(
struct inode *inode)
{
@@ -172,6 +187,12 @@ xfs_setfilesize_ioend(
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
+ /* we abort the update if there was an IO error */
+ if (ioend->io_error) {
+ xfs_trans_cancel(tp);
+ return ioend->io_error;
+ }
+
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
@@ -208,18 +229,23 @@ xfs_end_io(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
int error = 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ /*
+ * Set an error if the mount has shut down and proceed with end I/O
+ * processing so it can perform whatever cleanups are necessary.
+ */
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
ioend->io_error = -EIO;
- goto done;
- }
- if (ioend->io_error)
- goto done;
/*
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
+ * Detecting and handling completion IO errors is done individually
+ * for each case as different cleanup operations need to be performed
+ * on error.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ if (ioend->io_error)
+ goto done;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
} else if (ioend->io_append_trans) {
@@ -256,7 +282,7 @@ xfs_alloc_ioend(
*/
atomic_set(&ioend->io_remaining, 1);
ioend->io_error = 0;
- ioend->io_list = NULL;
+ INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = type;
ioend->io_inode = inode;
ioend->io_buffer_head = NULL;
@@ -274,8 +300,7 @@ xfs_map_blocks(
struct inode *inode,
loff_t offset,
struct xfs_bmbt_irec *imap,
- int type,
- int nonblocking)
+ int type)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -291,12 +316,7 @@ xfs_map_blocks(
if (type == XFS_IO_UNWRITTEN)
bmapi_flags |= XFS_BMAPI_IGSTATE;
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
- if (nonblocking)
- return -EAGAIN;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- }
-
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -332,7 +352,7 @@ xfs_map_blocks(
return 0;
}
-STATIC int
+STATIC bool
xfs_imap_valid(
struct inode *inode,
struct xfs_bmbt_irec *imap,
@@ -405,8 +425,7 @@ xfs_start_buffer_writeback(
STATIC void
xfs_start_page_writeback(
struct page *page,
- int clear_dirty,
- int buffers)
+ int clear_dirty)
{
ASSERT(PageLocked(page));
ASSERT(!PageWriteback(page));
@@ -425,10 +444,6 @@ xfs_start_page_writeback(
set_page_writeback_keepwrite(page);
unlock_page(page);
-
- /* If no buffers on the page are to be written, finish it here */
- if (!buffers)
- end_page_writeback(page);
}
static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -437,153 +452,101 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
}
/*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
*
* If @fail is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked paged for writeback
* and unlocked them. In this situation, we need to fail the ioend chain rather
* than submit it to IO. This typically only happens on a filesystem shutdown.
*/
-STATIC void
+STATIC int
xfs_submit_ioend(
struct writeback_control *wbc,
xfs_ioend_t *ioend,
- int fail)
+ int status)
{
- xfs_ioend_t *head = ioend;
- xfs_ioend_t *next;
struct buffer_head *bh;
struct bio *bio;
sector_t lastblock = 0;
- /* Pass 1 - start writeback */
- do {
- next = ioend->io_list;
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
- xfs_start_buffer_writeback(bh);
- } while ((ioend = next) != NULL);
+ /* Reserve log space if we might write beyond the on-disk inode size. */
+ if (!status &&
+ ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ status = xfs_setfilesize_trans_alloc(ioend);
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ if (status) {
+ ioend->io_error = status;
+ xfs_finish_ioend(ioend);
+ return status;
+ }
- /* Pass 2 - submit I/O */
- ioend = head;
- do {
- next = ioend->io_list;
- bio = NULL;
+ bio = NULL;
+ for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
- /*
- * If we are failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- if (fail) {
- ioend->io_error = fail;
- xfs_finish_ioend(ioend);
- continue;
+ if (!bio) {
+retry:
+ bio = xfs_alloc_ioend_bio(bh);
+ } else if (bh->b_blocknr != lastblock + 1) {
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ goto retry;
}
- for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
- if (!bio) {
- retry:
- bio = xfs_alloc_ioend_bio(bh);
- } else if (bh->b_blocknr != lastblock + 1) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
-
- if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
- xfs_submit_ioend_bio(wbc, ioend, bio);
- goto retry;
- }
-
- lastblock = bh->b_blocknr;
- }
- if (bio)
+ if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
xfs_submit_ioend_bio(wbc, ioend, bio);
- xfs_finish_ioend(ioend);
- } while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too. Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
- xfs_ioend_t *ioend)
-{
- xfs_ioend_t *next;
- struct buffer_head *bh, *next_bh;
-
- do {
- next = ioend->io_list;
- bh = ioend->io_buffer_head;
- do {
- next_bh = bh->b_private;
- clear_buffer_async_write(bh);
- /*
- * The unwritten flag is cleared when added to the
- * ioend. We're not submitting for I/O so mark the
- * buffer unwritten again for next time around.
- */
- if (ioend->io_type == XFS_IO_UNWRITTEN)
- set_buffer_unwritten(bh);
- unlock_buffer(bh);
- } while ((bh = next_bh) != NULL);
+ goto retry;
+ }
- mempool_free(ioend, xfs_ioend_pool);
- } while ((ioend = next) != NULL);
+ lastblock = bh->b_blocknr;
+ }
+ if (bio)
+ xfs_submit_ioend_bio(wbc, ioend, bio);
+ xfs_finish_ioend(ioend);
+ return 0;
}
/*
* Test to see if we've been building up a completion structure for
* earlier buffers -- if so, we try to append to this ioend if we
* can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
+ * Return the ioend we finished off so that the caller can submit it
+ * once it has finished processing the dirty page.
*/
STATIC void
xfs_add_to_ioend(
struct inode *inode,
struct buffer_head *bh,
xfs_off_t offset,
- unsigned int type,
- xfs_ioend_t **result,
- int need_ioend)
+ struct xfs_writepage_ctx *wpc,
+ struct list_head *iolist)
{
- xfs_ioend_t *ioend = *result;
-
- if (!ioend || need_ioend || type != ioend->io_type) {
- xfs_ioend_t *previous = *result;
-
- ioend = xfs_alloc_ioend(inode, type);
- ioend->io_offset = offset;
- ioend->io_buffer_head = bh;
- ioend->io_buffer_tail = bh;
- if (previous)
- previous->io_list = ioend;
- *result = ioend;
+ if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ bh->b_blocknr != wpc->last_block + 1 ||
+ offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
+ struct xfs_ioend *new;
+
+ if (wpc->ioend)
+ list_add(&wpc->ioend->io_list, iolist);
+
+ new = xfs_alloc_ioend(inode, wpc->io_type);
+ new->io_offset = offset;
+ new->io_buffer_head = bh;
+ new->io_buffer_tail = bh;
+ wpc->ioend = new;
} else {
- ioend->io_buffer_tail->b_private = bh;
- ioend->io_buffer_tail = bh;
+ wpc->ioend->io_buffer_tail->b_private = bh;
+ wpc->ioend->io_buffer_tail = bh;
}
bh->b_private = NULL;
- ioend->io_size += bh->b_size;
+ wpc->ioend->io_size += bh->b_size;
+ wpc->last_block = bh->b_blocknr;
+ xfs_start_buffer_writeback(bh);
}
STATIC void
@@ -669,183 +632,6 @@ xfs_check_page_type(
return false;
}
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
- struct inode *inode,
- struct page *page,
- loff_t tindex,
- struct xfs_bmbt_irec *imap,
- xfs_ioend_t **ioendp,
- struct writeback_control *wbc)
-{
- struct buffer_head *bh, *head;
- xfs_off_t end_offset;
- unsigned long p_offset;
- unsigned int type;
- int len, page_dirty;
- int count = 0, done = 0, uptodate = 1;
- xfs_off_t offset = page_offset(page);
-
- if (page->index != tindex)
- goto fail;
- if (!trylock_page(page))
- goto fail;
- if (PageWriteback(page))
- goto fail_unlock_page;
- if (page->mapping != inode->i_mapping)
- goto fail_unlock_page;
- if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
- goto fail_unlock_page;
-
- /*
- * page_dirty is initially a count of buffers on the page before
- * EOF and is decremented as we move each into a cleanable state.
- *
- * Derivation:
- *
- * End offset is the highest offset that this page should represent.
- * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
- * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
- * hence give us the correct page_dirty count. On any other page,
- * it will be zero and in that case we need page_dirty to be the
- * count of buffers on the page.
- */
- end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
- i_size_read(inode));
-
- /*
- * If the current map does not span the entire page we are about to try
- * to write, then give up. The only way we can write a page that spans
- * multiple mappings in a single writeback iteration is via the
- * xfs_vm_writepage() function. Data integrity writeback requires the
- * entire page to be written in a single attempt, otherwise the part of
- * the page we don't write here doesn't get written as part of the data
- * integrity sync.
- *
- * For normal writeback, we also don't attempt to write partial pages
- * here as it simply means that write_cache_pages() will see it under
- * writeback and ignore the page until some point in the future, at
- * which time this will be the only page in the file that needs
- * writeback. Hence for more optimal IO patterns, we should always
- * avoid partial page writeback due to multiple mappings on a page here.
- */
- if (!xfs_imap_valid(inode, imap, end_offset))
- goto fail_unlock_page;
-
- len = 1 << inode->i_blkbits;
- p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
- PAGE_CACHE_SIZE);
- p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
- page_dirty = p_offset / len;
-
- /*
- * The moment we find a buffer that doesn't match our current type
- * specification or can't be written, abort the loop and start
- * writeback. As per the above xfs_imap_valid() check, only
- * xfs_vm_writepage() can handle partial page writeback fully - we are
- * limited here to the buffers that are contiguous with the current
- * ioend, and hence a buffer we can't write breaks that contiguity and
- * we have to defer the rest of the IO to xfs_vm_writepage().
- */
- bh = head = page_buffers(page);
- do {
- if (offset >= end_offset)
- break;
- if (!buffer_uptodate(bh))
- uptodate = 0;
- if (!(PageUptodate(page) || buffer_uptodate(bh))) {
- done = 1;
- break;
- }
-
- if (buffer_unwritten(bh) || buffer_delay(bh) ||
- buffer_mapped(bh)) {
- if (buffer_unwritten(bh))
- type = XFS_IO_UNWRITTEN;
- else if (buffer_delay(bh))
- type = XFS_IO_DELALLOC;
- else
- type = XFS_IO_OVERWRITE;
-
- /*
- * imap should always be valid because of the above
- * partial page end_offset check on the imap.
- */
- ASSERT(xfs_imap_valid(inode, imap, offset));
-
- lock_buffer(bh);
- if (type != XFS_IO_OVERWRITE)
- xfs_map_at_offset(inode, bh, imap, offset);
- xfs_add_to_ioend(inode, bh, offset, type,
- ioendp, done);
-
- page_dirty--;
- count++;
- } else {
- done = 1;
- break;
- }
- } while (offset += len, (bh = bh->b_this_page) != head);
-
- if (uptodate && bh == head)
- SetPageUptodate(page);
-
- if (count) {
- if (--wbc->nr_to_write <= 0 &&
- wbc->sync_mode == WB_SYNC_NONE)
- done = 1;
- }
- xfs_start_page_writeback(page, !page_dirty, count);
-
- return done;
- fail_unlock_page:
- unlock_page(page);
- fail:
- return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
- struct inode *inode,
- pgoff_t tindex,
- struct xfs_bmbt_irec *imap,
- xfs_ioend_t **ioendp,
- struct writeback_control *wbc,
- pgoff_t tlast)
-{
- struct pagevec pvec;
- int done = 0, i;
-
- pagevec_init(&pvec, 0);
- while (!done && tindex <= tlast) {
- unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
- if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
- break;
-
- for (i = 0; i < pagevec_count(&pvec); i++) {
- done = xfs_convert_page(inode, pvec.pages[i], tindex++,
- imap, ioendp, wbc);
- if (done)
- break;
- }
-
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
STATIC void
xfs_vm_invalidatepage(
struct page *page,
@@ -918,11 +704,169 @@ next_buffer:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_invalidate:
- xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
return;
}
/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding buffers to is cached on the writepage context, and if the new buffer
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected. While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+static int
+xfs_writepage_map(
+ struct xfs_writepage_ctx *wpc,
+ struct writeback_control *wbc,
+ struct inode *inode,
+ struct page *page,
+ loff_t offset,
+ __uint64_t end_offset)
+{
+ LIST_HEAD(submit_list);
+ struct xfs_ioend *ioend, *next;
+ struct buffer_head *bh, *head;
+ ssize_t len = 1 << inode->i_blkbits;
+ int error = 0;
+ int count = 0;
+ int uptodate = 1;
+
+ bh = head = page_buffers(page);
+ offset = page_offset(page);
+ do {
+ if (offset >= end_offset)
+ break;
+ if (!buffer_uptodate(bh))
+ uptodate = 0;
+
+ /*
+ * set_page_dirty dirties all buffers in a page, independent
+ * of their state. The dirty state however is entirely
+ * meaningless for holes (!mapped && uptodate), so skip
+ * buffers covering holes here.
+ */
+ if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+ wpc->imap_valid = false;
+ continue;
+ }
+
+ if (buffer_unwritten(bh)) {
+ if (wpc->io_type != XFS_IO_UNWRITTEN) {
+ wpc->io_type = XFS_IO_UNWRITTEN;
+ wpc->imap_valid = false;
+ }
+ } else if (buffer_delay(bh)) {
+ if (wpc->io_type != XFS_IO_DELALLOC) {
+ wpc->io_type = XFS_IO_DELALLOC;
+ wpc->imap_valid = false;
+ }
+ } else if (buffer_uptodate(bh)) {
+ if (wpc->io_type != XFS_IO_OVERWRITE) {
+ wpc->io_type = XFS_IO_OVERWRITE;
+ wpc->imap_valid = false;
+ }
+ } else {
+ if (PageUptodate(page))
+ ASSERT(buffer_mapped(bh));
+ /*
+ * This buffer is not uptodate and will not be
+ * written to disk. Ensure that we will put any
+ * subsequent writeable buffers into a new
+ * ioend.
+ */
+ wpc->imap_valid = false;
+ continue;
+ }
+
+ if (wpc->imap_valid)
+ wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+ offset);
+ if (!wpc->imap_valid) {
+ error = xfs_map_blocks(inode, offset, &wpc->imap,
+ wpc->io_type);
+ if (error)
+ goto out;
+ wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
+ offset);
+ }
+ if (wpc->imap_valid) {
+ lock_buffer(bh);
+ if (wpc->io_type != XFS_IO_OVERWRITE)
+ xfs_map_at_offset(inode, bh, &wpc->imap, offset);
+ xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
+ count++;
+ }
+
+ } while (offset += len, ((bh = bh->b_this_page) != head));
+
+ if (uptodate && bh == head)
+ SetPageUptodate(page);
+
+ ASSERT(wpc->ioend || list_empty(&submit_list));
+
+out:
+ /*
+ * On error, we have to fail the ioend here because we have locked
+ * buffers in the ioend. If we don't do this, we'll deadlock
+ * invalidating the page as that tries to lock the buffers on the page.
+ * Also, because we may have set pages under writeback, we have to make
+ * sure we run IO completion to mark the error state of the IO
+ * appropriately, so we can't cancel the ioend directly here. That means
+ * we have to mark this page as under writeback if we included any
+ * buffers from it in the ioend chain so that completion treats it
+ * correctly.
+ *
+ * If we didn't include the page in the ioend, the on error we can
+ * simply discard and unlock it as there are no other users of the page
+ * or it's buffers right now. The caller will still need to trigger
+ * submission of outstanding ioends on the writepage context so they are
+ * treated correctly on error.
+ */
+ if (count) {
+ xfs_start_page_writeback(page, !error);
+
+ /*
+ * Preserve the original error if there was one, otherwise catch
+ * submission errors here and propagate into subsequent ioend
+ * submissions.
+ */
+ list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+ int error2;
+
+ list_del_init(&ioend->io_list);
+ error2 = xfs_submit_ioend(wbc, ioend, error);
+ if (error2 && !error)
+ error = error2;
+ }
+ } else if (error) {
+ xfs_aops_discard_page(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ } else {
+ /*
+ * We can end up here with no error and nothing to write if we
+ * race with a partial page truncate on a sub-page block sized
+ * filesystem. In that case we need to mark the page clean.
+ */
+ xfs_start_page_writeback(page, 1);
+ end_page_writeback(page);
+ }
+
+ mapping_set_error(page->mapping, error);
+ return error;
+}
+
+/*
* Write out a dirty page.
*
* For delalloc space on the page we need to allocate space and flush it.
@@ -931,22 +875,16 @@ out_invalidate:
* For any other dirty buffer heads on the page we should flush them.
*/
STATIC int
-xfs_vm_writepage(
+xfs_do_writepage(
struct page *page,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ void *data)
{
+ struct xfs_writepage_ctx *wpc = data;
struct inode *inode = page->mapping->host;
- struct buffer_head *bh, *head;
- struct xfs_bmbt_irec imap;
- xfs_ioend_t *ioend = NULL, *iohead = NULL;
loff_t offset;
- unsigned int type;
__uint64_t end_offset;
- pgoff_t end_index, last_index;
- ssize_t len;
- int err, imap_valid = 0, uptodate = 1;
- int count = 0;
- int nonblocking = 0;
+ pgoff_t end_index;
trace_xfs_writepage(inode, page, 0, 0);
@@ -973,12 +911,9 @@ xfs_vm_writepage(
if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
goto redirty;
- /* Is this page beyond the end of the file? */
- offset = i_size_read(inode);
- end_index = offset >> PAGE_CACHE_SHIFT;
- last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
-
/*
+ * Is this page beyond the end of the file?
+ *
* The page index is less than the end_index, adjust the end_offset
* to the highest offset that this page should represent.
* -----------------------------------------------------
@@ -989,8 +924,10 @@ xfs_vm_writepage(
* | desired writeback range | see else |
* ---------------------------------^------------------|
*/
+ offset = i_size_read(inode);
+ end_index = offset >> PAGE_SHIFT;
if (page->index < end_index)
- end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+ end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
else {
/*
* Check whether the page to write out is beyond or straddles
@@ -1003,7 +940,7 @@ xfs_vm_writepage(
* | | Straddles |
* ---------------------------------^-----------|--------|
*/
- unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+ unsigned offset_into_page = offset & (PAGE_SIZE - 1);
/*
* Skip the page if it is fully outside i_size, e.g. due to a
@@ -1034,158 +971,13 @@ xfs_vm_writepage(
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
- zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+ zero_user_segment(page, offset_into_page, PAGE_SIZE);
/* Adjust the end_offset to the end of file */
end_offset = offset;
}
- len = 1 << inode->i_blkbits;
-
- bh = head = page_buffers(page);
- offset = page_offset(page);
- type = XFS_IO_OVERWRITE;
-
- if (wbc->sync_mode == WB_SYNC_NONE)
- nonblocking = 1;
-
- do {
- int new_ioend = 0;
-
- if (offset >= end_offset)
- break;
- if (!buffer_uptodate(bh))
- uptodate = 0;
-
- /*
- * set_page_dirty dirties all buffers in a page, independent
- * of their state. The dirty state however is entirely
- * meaningless for holes (!mapped && uptodate), so skip
- * buffers covering holes here.
- */
- if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
- imap_valid = 0;
- continue;
- }
-
- if (buffer_unwritten(bh)) {
- if (type != XFS_IO_UNWRITTEN) {
- type = XFS_IO_UNWRITTEN;
- imap_valid = 0;
- }
- } else if (buffer_delay(bh)) {
- if (type != XFS_IO_DELALLOC) {
- type = XFS_IO_DELALLOC;
- imap_valid = 0;
- }
- } else if (buffer_uptodate(bh)) {
- if (type != XFS_IO_OVERWRITE) {
- type = XFS_IO_OVERWRITE;
- imap_valid = 0;
- }
- } else {
- if (PageUptodate(page))
- ASSERT(buffer_mapped(bh));
- /*
- * This buffer is not uptodate and will not be
- * written to disk. Ensure that we will put any
- * subsequent writeable buffers into a new
- * ioend.
- */
- imap_valid = 0;
- continue;
- }
-
- if (imap_valid)
- imap_valid = xfs_imap_valid(inode, &imap, offset);
- if (!imap_valid) {
- /*
- * If we didn't have a valid mapping then we need to
- * put the new mapping into a separate ioend structure.
- * This ensures non-contiguous extents always have
- * separate ioends, which is particularly important
- * for unwritten extent conversion at I/O completion
- * time.
- */
- new_ioend = 1;
- err = xfs_map_blocks(inode, offset, &imap, type,
- nonblocking);
- if (err)
- goto error;
- imap_valid = xfs_imap_valid(inode, &imap, offset);
- }
- if (imap_valid) {
- lock_buffer(bh);
- if (type != XFS_IO_OVERWRITE)
- xfs_map_at_offset(inode, bh, &imap, offset);
- xfs_add_to_ioend(inode, bh, offset, type, &ioend,
- new_ioend);
- count++;
- }
-
- if (!iohead)
- iohead = ioend;
-
- } while (offset += len, ((bh = bh->b_this_page) != head));
-
- if (uptodate && bh == head)
- SetPageUptodate(page);
-
- xfs_start_page_writeback(page, 1, count);
-
- /* if there is no IO to be submitted for this page, we are done */
- if (!ioend)
- return 0;
-
- ASSERT(iohead);
-
- /*
- * Any errors from this point onwards need tobe reported through the IO
- * completion path as we have marked the initial page as under writeback
- * and unlocked it.
- */
- if (imap_valid) {
- xfs_off_t end_index;
-
- end_index = imap.br_startoff + imap.br_blockcount;
-
- /* to bytes */
- end_index <<= inode->i_blkbits;
-
- /* to pages */
- end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
-
- /* check against file size */
- if (end_index > last_index)
- end_index = last_index;
-
- xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
- wbc, end_index);
- }
-
-
- /*
- * Reserve log space if we might write beyond the on-disk inode size.
- */
- err = 0;
- if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
- err = xfs_setfilesize_trans_alloc(ioend);
-
- xfs_submit_ioend(wbc, iohead, err);
-
- return 0;
-
-error:
- if (iohead)
- xfs_cancel_ioend(iohead);
-
- if (err == -EAGAIN)
- goto redirty;
-
- xfs_aops_discard_page(page);
- ClearPageUptodate(page);
- unlock_page(page);
- return err;
+ return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
redirty:
redirty_page_for_writepage(wbc, page);
@@ -1194,12 +986,40 @@ redirty:
}
STATIC int
+xfs_vm_writepage(
+ struct page *page,
+ struct writeback_control *wbc)
+{
+ struct xfs_writepage_ctx wpc = {
+ .io_type = XFS_IO_INVALID,
+ };
+ int ret;
+
+ ret = xfs_do_writepage(page, wbc, &wpc);
+ if (wpc.ioend)
+ ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+ return ret;
+}
+
+STATIC int
xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct xfs_writepage_ctx wpc = {
+ .io_type = XFS_IO_INVALID,
+ };
+ int ret;
+
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- return generic_writepages(mapping, wbc);
+ if (dax_mapping(mapping))
+ return dax_writeback_mapping_range(mapping,
+ xfs_find_bdev_for_inode(mapping->host), wbc);
+
+ ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
+ if (wpc.ioend)
+ ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+ return ret;
}
/*
@@ -1229,27 +1049,15 @@ xfs_vm_releasepage(
}
/*
- * When we map a DIO buffer, we may need to attach an ioend that describes the
- * type of write IO we are doing. This passes to the completion function the
- * operations it needs to perform. If the mapping is for an overwrite wholly
- * within the EOF then we don't need an ioend and so we don't allocate one.
- * This avoids the unnecessary overhead of allocating and freeing ioends for
- * workloads that don't require transactions on IO completion.
- *
- * If we get multiple mappings in a single IO, we might be mapping different
- * types. But because the direct IO can only have a single private pointer, we
- * need to ensure that:
+ * When we map a DIO buffer, we may need to pass flags to
+ * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
*
- * a) i) the ioend spans the entire region of unwritten mappings; or
- * ii) the ioend spans all the mappings that cross or are beyond EOF; and
- * b) if it contains unwritten extents, it is *permanently* marked as such
- *
- * We could do this by chaining ioends like buffered IO does, but we only
- * actually get one IO completion callback from the direct IO, and that spans
- * the entire IO regardless of how many mappings and IOs are needed to complete
- * the DIO. There is only going to be one reference to the ioend and its life
- * cycle is constrained by the DIO completion code. hence we don't need
- * reference counting here.
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
*/
static void
xfs_map_direct(
@@ -1258,44 +1066,18 @@ xfs_map_direct(
struct xfs_bmbt_irec *imap,
xfs_off_t offset)
{
- struct xfs_ioend *ioend;
+ uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
xfs_off_t size = bh_result->b_size;
- int type;
- if (ISUNWRITTEN(imap))
- type = XFS_IO_UNWRITTEN;
- else
- type = XFS_IO_OVERWRITE;
-
- trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
-
- if (bh_result->b_private) {
- ioend = bh_result->b_private;
- ASSERT(ioend->io_size > 0);
- ASSERT(offset >= ioend->io_offset);
- if (offset + size > ioend->io_offset + ioend->io_size)
- ioend->io_size = offset - ioend->io_offset + size;
-
- if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
- ioend->io_type = XFS_IO_UNWRITTEN;
-
- trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
- ioend->io_size, ioend->io_type,
- imap);
- } else if (type == XFS_IO_UNWRITTEN ||
- offset + size > i_size_read(inode)) {
- ioend = xfs_alloc_ioend(inode, type);
- ioend->io_offset = offset;
- ioend->io_size = size;
-
- bh_result->b_private = ioend;
- set_buffer_defer_completion(bh_result);
+ trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
+ ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
- trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
- imap);
- } else {
- trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
- imap);
+ if (ISUNWRITTEN(imap)) {
+ *flags |= XFS_DIO_FLAG_UNWRITTEN;
+ set_buffer_defer_completion(bh_result);
+ } else if (offset + size > i_size_read(inode) || offset + size < 0) {
+ *flags |= XFS_DIO_FLAG_APPEND;
+ set_buffer_defer_completion(bh_result);
}
}
@@ -1345,7 +1127,8 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- bool direct)
+ bool direct,
+ bool dax_fault)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1393,18 +1176,20 @@ __xfs_get_blocks(
if (error)
goto out_unlock;
+ /* for DAX, we convert unwritten extents directly */
if (create &&
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK))) {
+ imap.br_startblock == DELAYSTARTBLOCK) ||
+ (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
if (direct || xfs_get_extsz_hint(ip)) {
/*
- * Drop the ilock in preparation for starting the block
- * allocation transaction. It will be retaken
- * exclusively inside xfs_iomap_write_direct for the
- * actual allocation.
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
*/
- xfs_iunlock(ip, lockmode);
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
if (error)
@@ -1441,6 +1226,12 @@ __xfs_get_blocks(
goto out_unlock;
}
+ if (IS_DAX(inode) && create) {
+ ASSERT(!ISUNWRITTEN(&imap));
+ /* zeroing is not needed at a higher layer */
+ new = 0;
+ }
+
/* trim mapping down to size requested */
if (direct || size > (1 << inode->i_blkbits))
xfs_map_trim_size(inode, iblock, bh_result,
@@ -1457,8 +1248,12 @@ __xfs_get_blocks(
if (ISUNWRITTEN(&imap))
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
- if (create && direct)
- xfs_map_direct(inode, bh_result, &imap, offset);
+ if (create && direct) {
+ if (dax_fault)
+ ASSERT(!ISUNWRITTEN(&imap));
+ else
+ xfs_map_direct(inode, bh_result, &imap, offset);
+ }
}
/*
@@ -1505,7 +1300,7 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}
int
@@ -1515,45 +1310,63 @@ xfs_get_blocks_direct(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
}
-static void
-__xfs_end_io_direct_write(
+int
+xfs_get_blocks_dax_fault(
struct inode *inode,
- struct xfs_ioend *ioend,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
+}
+
+/*
+ * Complete a direct I/O write request.
+ *
+ * xfs_map_direct passes us some flags in the private data to tell us what to
+ * do. If no flags are set, then the write IO is an overwrite wholly within
+ * the existing allocated file size and so there is nothing for us to do.
+ *
+ * Note that in this case the completion can be called in interrupt context,
+ * whereas if we have flags set we will always be called in task context
+ * (i.e. from a workqueue).
+ */
+STATIC int
+xfs_end_io_direct_write(
+ struct kiocb *iocb,
loff_t offset,
- ssize_t size)
+ ssize_t size,
+ void *private)
{
- struct xfs_mount *mp = XFS_I(inode)->i_mount;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ uintptr_t flags = (uintptr_t)private;
+ int error = 0;
- if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
- goto out_end_io;
+ trace_xfs_end_io_direct_write(ip, offset, size);
- /*
- * dio completion end_io functions are only called on writes if more
- * than 0 bytes was written.
- */
- ASSERT(size > 0);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
- /*
- * The ioend only maps whole blocks, while the IO may be sector aligned.
- * Hence the ioend offset/size may not match the IO offset/size exactly.
- * Because we don't map overwrites within EOF into the ioend, the offset
- * may not match, but only if the endio spans EOF. Either way, write
- * the IO sizes into the ioend so that completion processing does the
- * right thing.
- */
- ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
- ioend->io_size = size;
- ioend->io_offset = offset;
+ if (size <= 0)
+ return size;
/*
- * The ioend tells us whether we are doing unwritten extent conversion
+ * The flags tell us whether we are doing unwritten extent conversions
* or an append transaction that updates the on-disk file size. These
* cases are the only cases where we should *potentially* be needing
* to update the VFS inode size.
- *
+ */
+ if (flags == 0) {
+ ASSERT(offset + size <= i_size_read(inode));
+ return 0;
+ }
+
+ /*
* We need to update the in-core inode size here so that we don't end up
* with the on-disk inode size being outside the in-core inode size. We
* have no other method of updating EOF for AIO, so always do it here
@@ -1564,130 +1377,56 @@ __xfs_end_io_direct_write(
* here can result in EOF moving backwards and Bad Things Happen when
* that occurs.
*/
- spin_lock(&XFS_I(inode)->i_flags_lock);
+ spin_lock(&ip->i_flags_lock);
if (offset + size > i_size_read(inode))
i_size_write(inode, offset + size);
- spin_unlock(&XFS_I(inode)->i_flags_lock);
+ spin_unlock(&ip->i_flags_lock);
- /*
- * If we are doing an append IO that needs to update the EOF on disk,
- * do the transaction reserve now so we can use common end io
- * processing. Stashing the error (if there is one) in the ioend will
- * result in the ioend processing passing on the error if it is
- * possible as we can't return it from here.
- */
- if (ioend->io_type == XFS_IO_OVERWRITE)
- ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+ if (flags & XFS_DIO_FLAG_UNWRITTEN) {
+ trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
-out_end_io:
- xfs_end_io(&ioend->io_work);
- return;
-}
+ error = xfs_iomap_write_unwritten(ip, offset, size);
+ } else if (flags & XFS_DIO_FLAG_APPEND) {
+ struct xfs_trans *tp;
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
- struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- struct xfs_ioend *ioend = private;
+ trace_xfs_end_io_direct_write_append(ip, offset, size);
- trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
- ioend ? ioend->io_type : 0, NULL);
-
- if (!ioend) {
- ASSERT(offset + size <= i_size_read(inode));
- return;
+ tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp);
+ return error;
+ }
+ error = xfs_setfilesize(ip, tp, offset, size);
}
- __xfs_end_io_direct_write(inode, ioend, offset, size);
-}
-
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
- struct buffer_head *bh,
- int uptodate)
-{
- struct xfs_ioend *ioend = bh->b_private;
- struct inode *inode = ioend->io_inode;
- ssize_t size = ioend->io_size;
-
- ASSERT(IS_DAX(ioend->io_inode));
-
- /* if there was an error zeroing, then don't convert it */
- if (!uptodate)
- ioend->io_error = -EIO;
-
- /*
- * Trim update to EOF, so we don't extend EOF during unwritten extent
- * conversion of partial EOF blocks.
- */
- spin_lock(&XFS_I(inode)->i_flags_lock);
- if (ioend->io_offset + size > i_size_read(inode))
- size = i_size_read(inode) - ioend->io_offset;
- spin_unlock(&XFS_I(inode)->i_flags_lock);
-
- __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-
+ return error;
}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-static inline ssize_t
-xfs_vm_do_dio(
- struct inode *inode,
+STATIC ssize_t
+xfs_vm_direct_IO(
struct kiocb *iocb,
struct iov_iter *iter,
- loff_t offset,
- void (*endio)(struct kiocb *iocb,
- loff_t offset,
- ssize_t size,
- void *private),
- int flags)
+ loff_t offset)
{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ dio_iodone_t *endio = NULL;
+ int flags = 0;
struct block_device *bdev;
- if (IS_DAX(inode))
+ if (iov_iter_rw(iter) == WRITE) {
+ endio = xfs_end_io_direct_write;
+ flags = DIO_ASYNC_EXTEND;
+ }
+
+ if (IS_DAX(inode)) {
return dax_do_io(iocb, inode, iter, offset,
xfs_get_blocks_direct, endio, 0);
+ }
bdev = xfs_find_bdev_for_inode(inode);
return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
- xfs_get_blocks_direct, endio, NULL, flags);
-}
-
-STATIC ssize_t
-xfs_vm_direct_IO(
- struct kiocb *iocb,
- struct iov_iter *iter,
- loff_t offset)
-{
- struct inode *inode = iocb->ki_filp->f_mapping->host;
-
- if (iov_iter_rw(iter) == WRITE)
- return xfs_vm_do_dio(inode, iocb, iter, offset,
- xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
- return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
+ xfs_get_blocks_direct, endio, NULL, flags);
}
/*
@@ -1736,9 +1475,10 @@ xfs_vm_write_failed(
loff_t block_offset;
loff_t block_start;
loff_t block_end;
- loff_t from = pos & (PAGE_CACHE_SIZE - 1);
+ loff_t from = pos & (PAGE_SIZE - 1);
loff_t to = from + len;
struct buffer_head *bh, *head;
+ struct xfs_mount *mp = XFS_I(inode)->i_mount;
/*
* The request pos offset might be 32 or 64 bit, this is all fine
@@ -1751,7 +1491,7 @@ xfs_vm_write_failed(
* start of the page by using shifts rather than masks the mismatch
* problem.
*/
- block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+ block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
ASSERT(block_offset + from == pos);
@@ -1770,14 +1510,23 @@ xfs_vm_write_failed(
if (block_start >= to)
break;
- if (!buffer_delay(bh))
+ /*
+ * Process delalloc and unwritten buffers beyond EOF. We can
+ * encounter unwritten buffers in the event that a file has
+ * post-EOF unwritten extents and an extending write happens to
+ * fail (e.g., an unaligned write that also involves a delalloc
+ * to the same page).
+ */
+ if (!buffer_delay(bh) && !buffer_unwritten(bh))
continue;
- if (!buffer_new(bh) && block_offset < i_size_read(inode))
+ if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
+ block_offset < i_size_read(inode))
continue;
- xfs_vm_kill_delalloc_range(inode, block_offset,
- block_offset + bh->b_size);
+ if (buffer_delay(bh))
+ xfs_vm_kill_delalloc_range(inode, block_offset,
+ block_offset + bh->b_size);
/*
* This buffer does not contain data anymore. make sure anyone
@@ -1788,6 +1537,7 @@ xfs_vm_write_failed(
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_dirty(bh);
+ clear_buffer_unwritten(bh);
}
}
@@ -1808,17 +1558,20 @@ xfs_vm_write_begin(
struct page **pagep,
void **fsdata)
{
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status;
+ struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
- ASSERT(len <= PAGE_CACHE_SIZE);
+ ASSERT(len <= PAGE_SIZE);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
status = __block_write_begin(page, pos, len, xfs_get_blocks);
+ if (xfs_mp_fail_writes(mp))
+ status = -EIO;
if (unlikely(status)) {
struct inode *inode = mapping->host;
size_t isize = i_size_read(inode);
@@ -1831,13 +1584,15 @@ xfs_vm_write_begin(
* allocated in this write, not blocks that were previously
* written successfully.
*/
+ if (xfs_mp_fail_writes(mp))
+ isize = 0;
if (pos + len > isize) {
ssize_t start = max_t(ssize_t, pos, isize);
truncate_pagecache_range(inode, start, pos + len);
}
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
@@ -1865,7 +1620,7 @@ xfs_vm_write_end(
{
int ret;
- ASSERT(len <= PAGE_CACHE_SIZE);
+ ASSERT(len <= PAGE_SIZE);
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
if (unlikely(ret < len)) {
@@ -1904,6 +1659,7 @@ xfs_vm_readpage(
struct file *unused,
struct page *page)
{
+ trace_xfs_vm_readpage(page->mapping->host, 1);
return mpage_readpage(page, xfs_get_blocks);
}
@@ -1914,6 +1670,7 @@ xfs_vm_readpages(
struct list_head *pages,
unsigned nr_pages)
{
+ trace_xfs_vm_readpages(mapping->host, nr_pages);
return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}
@@ -1938,7 +1695,6 @@ xfs_vm_set_page_dirty(
loff_t end_offset;
loff_t offset;
int newly_dirty;
- struct mem_cgroup *memcg;
if (unlikely(!mapping))
return !TestSetPageDirty(page);
@@ -1959,10 +1715,10 @@ xfs_vm_set_page_dirty(
} while (bh != head);
}
/*
- * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
- * per-memcg dirty page counters.
+ * Lock out page->mem_cgroup migration to keep PageDirty
+ * synchronized with per-memcg dirty page counters.
*/
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
newly_dirty = !TestSetPageDirty(page);
spin_unlock(&mapping->private_lock);
@@ -1973,13 +1729,13 @@ xfs_vm_set_page_dirty(
spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(!PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 86afd1ac7895..b4421177b68d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool;
* Types of I/O for bmap clustering and I/O completion tracking.
*/
enum {
+ XFS_IO_INVALID, /* initial state */
XFS_IO_DELALLOC, /* covers delalloc region */
XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
XFS_IO_OVERWRITE, /* covers already allocated extent */
};
#define XFS_IO_TYPES \
+ { XFS_IO_INVALID, "invalid" }, \
{ XFS_IO_DELALLOC, "delalloc" }, \
{ XFS_IO_UNWRITTEN, "unwritten" }, \
{ XFS_IO_OVERWRITE, "overwrite" }
@@ -39,7 +41,7 @@ enum {
* It can manage several multi-page bio's at once.
*/
typedef struct xfs_ioend {
- struct xfs_ioend *io_list; /* next ioend in chain */
+ struct list_head io_list; /* next ioend in chain */
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
@@ -58,8 +60,10 @@ int xfs_get_blocks(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
+int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
+extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 65fb37a18e92..4fa14820e2e2 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -202,8 +202,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
sbp->namelen,
sbp->valuelen,
&sbp->name[sbp->namelen]);
- if (error)
+ if (error) {
+ kmem_free(sbuf);
return error;
+ }
if (context->seen_enough)
break;
cursor->offset++;
@@ -454,14 +456,13 @@ xfs_attr3_leaf_list_int(
args.rmtblkcnt = xfs_attr3_rmt_blocks(
args.dp->i_mount, valuelen);
retval = xfs_attr_rmtval_get(&args);
- if (retval)
- return retval;
- retval = context->put_listent(context,
- entry->flags,
- name_rmt->name,
- (int)name_rmt->namelen,
- valuelen,
- args.value);
+ if (!retval)
+ retval = context->put_listent(context,
+ entry->flags,
+ name_rmt->name,
+ (int)name_rmt->namelen,
+ valuelen,
+ args.value);
kmem_free(args.value);
} else {
retval = context->put_listent(context,
@@ -511,7 +512,7 @@ xfs_attr_list_int(
xfs_inode_t *dp = context->dp;
uint lock_mode;
- XFS_STATS_INC(xs_attr_list);
+ XFS_STATS_INC(dp->i_mount, xs_attr_list);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3bf4ad0d19e4..3b6309865c65 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -57,37 +57,67 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
}
/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
+ ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
+
+ if (IS_DAX(VFS_I(ip)))
+ return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
+ sector, size);
+
+ /*
+ * let the block layer decide on the fastest method of
+ * implementing the zeroing.
+ */
+ return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+
+}
+
+/*
* Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
* caller. Frees all the extents that need freeing, which must be done
* last due to locking considerations. We never free any extents in
* the first transaction.
*
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
+ * If an inode *ip is provided, rejoin it to the transaction if
+ * the transaction was committed.
*/
int /* error */
xfs_bmap_finish(
struct xfs_trans **tp, /* transaction pointer addr */
struct xfs_bmap_free *flist, /* i/o: list extents to free */
- int *committed)/* xact committed or not */
+ struct xfs_inode *ip)
{
struct xfs_efd_log_item *efd; /* extent free data */
struct xfs_efi_log_item *efi; /* extent free intention */
int error; /* error return value */
+ int committed;/* xact committed or not */
struct xfs_bmap_free_item *free; /* free extent item */
struct xfs_bmap_free_item *next; /* next item on free list */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- if (flist->xbf_count == 0) {
- *committed = 0;
+ if (flist->xbf_count == 0)
return 0;
- }
+
efi = xfs_trans_get_efi(*tp, flist->xbf_count);
for (free = flist->xbf_first; free; free = free->xbfi_next)
xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
- error = __xfs_trans_roll(tp, NULL, committed);
+ error = __xfs_trans_roll(tp, ip, &committed);
if (error) {
/*
* If the transaction was committed, drop the EFD reference
@@ -99,16 +129,13 @@ xfs_bmap_finish(
* transaction so we should return committed=1 even though we're
* returning an error.
*/
- if (*committed) {
+ if (committed) {
xfs_efi_release(efi);
xfs_force_shutdown((*tp)->t_mountp,
(error == -EFSCORRUPTED) ?
SHUTDOWN_CORRUPT_INCORE :
SHUTDOWN_META_IO_ERROR);
- } else {
- *committed = 1;
}
-
return error;
}
@@ -176,10 +203,12 @@ xfs_bmap_rtalloc(
ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
/*
- * Lock out other modifications to the RT bitmap inode.
+ * Lock out modifications to both the RT bitmap and summary inodes
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
/*
* If it's an allocation to an empty file at offset 0,
@@ -229,6 +258,13 @@ xfs_bmap_rtalloc(
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+ /* Zero the extent if we were asked to do so */
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+ if (error)
+ return error;
+ }
} else {
ap->length = 0;
}
@@ -788,7 +824,7 @@ bool
xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
{
/* prealloc/delalloc exists only on regular files */
- if (!S_ISREG(ip->i_d.di_mode))
+ if (!S_ISREG(VFS_I(ip)->i_mode))
return false;
/*
@@ -933,7 +969,6 @@ xfs_alloc_file_space(
xfs_bmbt_irec_t imaps[1], *imapp;
xfs_bmap_free_t free_list;
uint qblocks, resblks, resrtextents;
- int committed;
int error;
trace_xfs_alloc_file_space(ip);
@@ -1027,24 +1062,21 @@ xfs_alloc_file_space(
xfs_bmap_init(&free_list, &firstfsb);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, &firstfsb,
- 0, imapp, &nimaps, &free_list);
- if (error) {
+ resblks, imapp, &nimaps, &free_list);
+ if (error)
goto error0;
- }
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
goto error0;
- }
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error) {
+ if (error)
break;
- }
allocated_fsb = imapp->br_blockcount;
@@ -1170,7 +1202,6 @@ xfs_free_file_space(
xfs_off_t offset,
xfs_off_t len)
{
- int committed;
int done;
xfs_fileoff_t endoffset_fsb;
int error;
@@ -1206,7 +1237,7 @@ xfs_free_file_space(
/* wait for the completion of any pending DIOs */
inode_dio_wait(VFS_I(ip));
- rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+ rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
ioffset = round_down(offset, rounding);
iendoffset = round_up(offset + len, rounding) - 1;
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
@@ -1310,17 +1341,15 @@ xfs_free_file_space(
error = xfs_bunmapi(tp, ip, startoffset_fsb,
endoffset_fsb - startoffset_fsb,
0, 2, &firstfsb, &free_list, &done);
- if (error) {
+ if (error)
goto error0;
- }
/*
* complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
+ if (error)
goto error0;
- }
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1398,7 +1427,6 @@ xfs_shift_file_space(
int error;
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
- int committed;
xfs_fileoff_t stop_fsb;
xfs_fileoff_t next_fsb;
xfs_fileoff_t shift_fsb;
@@ -1438,7 +1466,7 @@ xfs_shift_file_space(
if (error)
return error;
error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- offset >> PAGE_CACHE_SHIFT, -1);
+ offset >> PAGE_SHIFT, -1);
if (error)
return error;
@@ -1490,7 +1518,7 @@ xfs_shift_file_space(
if (error)
goto out_bmap_cancel;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -1701,7 +1729,7 @@ xfs_swap_extents(
xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
/* Verify that both files have the same format */
- if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
+ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
error = -EINVAL;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ecffb35935b..9a2191b91137 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -201,7 +201,7 @@ _xfs_buf_alloc(
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
- XFS_STATS_INC(xb_create);
+ XFS_STATS_INC(target->bt_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
return bp;
@@ -354,15 +354,16 @@ retry:
*/
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
__func__, gfp_mask);
- XFS_STATS_INC(xb_page_retries);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- XFS_STATS_INC(xb_page_found);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
@@ -516,7 +517,7 @@ _xfs_buf_find(
new_bp->b_pag = pag;
spin_unlock(&pag->pag_buf_lock);
} else {
- XFS_STATS_INC(xb_miss_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
}
@@ -529,11 +530,11 @@ found:
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
- XFS_STATS_INC(xb_busy_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
return NULL;
}
xfs_buf_lock(bp);
- XFS_STATS_INC(xb_get_locked_waited);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
}
/*
@@ -549,7 +550,7 @@ found:
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(xb_get_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
return bp;
}
@@ -603,7 +604,14 @@ found:
}
}
- XFS_STATS_INC(xb_get);
+ /*
+ * Clear b_error if this is a lookup from a caller that doesn't expect
+ * valid data to be found in the buffer.
+ */
+ if (!(flags & XBF_READ))
+ xfs_buf_ioerror(bp, 0);
+
+ XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
}
@@ -642,8 +650,8 @@ xfs_buf_read_map(
if (bp) {
trace_xfs_buf_read(bp, flags, _RET_IP_);
- if (!XFS_BUF_ISDONE(bp)) {
- XFS_STATS_INC(xb_get_read);
+ if (!(bp->b_flags & XBF_DONE)) {
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
@@ -1044,7 +1052,7 @@ xfs_buf_ioend_work(
xfs_buf_ioend(bp);
}
-void
+static void
xfs_buf_ioend_async(
struct xfs_buf *bp)
{
@@ -1519,6 +1527,16 @@ xfs_wait_buftarg(
LIST_HEAD(dispose);
int loop = 0;
+ /*
+ * We need to flush the buffer workqueue to ensure that all IO
+ * completion processing is 100% done. Just waiting on buffer locks is
+ * not sufficient for async IO as the reference count held over IO is
+ * not released until after the buffer lock is dropped. Hence we need to
+ * ensure here that all reference counts have been dropped before we
+ * start walking the LRU list.
+ */
+ drain_workqueue(btp->bt_mount->m_buf_workqueue);
+
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
@@ -1631,13 +1649,9 @@ xfs_setsize_buftarg(
btp->bt_meta_sectormask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
- char name[BDEVNAME_SIZE];
-
- bdevname(btp->bt_bdev, name);
-
xfs_warn(btp->bt_mount,
- "Cannot set_blocksize to %u on device %s",
- sectorsize, name);
+ "Cannot set_blocksize to %u on device %pg",
+ sectorsize, btp->bt_bdev);
return -EINVAL;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c79b717d9b88..4eb89bd4ee73 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -132,6 +132,7 @@ struct xfs_buf_map {
struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
struct xfs_buf_ops {
+ char *name;
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
};
@@ -301,6 +302,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
+extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
@@ -311,31 +313,6 @@ extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_init(void);
extern void xfs_buf_terminate(void);
-#define XFS_BUF_ZEROFLAGS(bp) \
- ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
- XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
- XBF_WRITE_FAIL))
-
-void xfs_buf_stale(struct xfs_buf *bp);
-#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
-#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
-
-#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
-#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
-#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
-
-#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
-#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
-#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
-
-#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
-#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
-#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
-
-#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
-#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
-#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
-
/*
* These macros use the IO block map rather than b_bn. b_bn is now really
* just for the buffer cache index for cached buffers. As IO does not use b_bn
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 7e986da34f6c..99e91a0e554e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -431,7 +431,7 @@ xfs_buf_item_unpin(
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
- ASSERT(XFS_BUF_ISSTALE(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
trace_xfs_buf_item_unpin_stale(bip);
@@ -493,7 +493,7 @@ xfs_buf_item_unpin(
xfs_buf_hold(bp);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioerror(bp, -EIO);
- XFS_BUF_UNDONE(bp);
+ bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
xfs_buf_ioend(bp);
}
@@ -1067,7 +1067,7 @@ xfs_buf_iodone_callbacks(
*/
if (XFS_FORCED_SHUTDOWN(mp)) {
xfs_buf_stale(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
trace_xfs_buf_item_iodone(bp, _RET_IP_);
goto do_callbacks;
}
@@ -1090,7 +1090,7 @@ xfs_buf_iodone_callbacks(
* errors tend to affect the whole device and a failing log write
* will make us give up. But we really ought to do better here.
*/
- if (XFS_BUF_ISASYNC(bp)) {
+ if (bp->b_flags & XBF_ASYNC) {
ASSERT(bp->b_iodone != NULL);
trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
@@ -1113,7 +1113,7 @@ xfs_buf_iodone_callbacks(
* sure to return the error to the caller of xfs_bwrite().
*/
xfs_buf_stale(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
trace_xfs_buf_error_relse(bp, _RET_IP_);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index a989a9c7edb7..93b3ab0c5435 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -665,8 +665,8 @@ xfs_readdir(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_getdents);
+ ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
+ XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index e85a9519a5ae..272c3f8b6f7d 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -227,7 +227,7 @@ xfs_discard_extents(
GFP_NOFS, 0);
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
- "discard failed for extent [0x%llu,%u], error %d",
+ "discard failed for extent [0x%llx,%u], error %d",
(unsigned long long)busyp->bno,
busyp->length,
error);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 30cb3afb67f0..316b2a1bdba5 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -75,9 +75,9 @@ xfs_qm_dqdestroy(
ASSERT(list_empty(&dqp->q_lru));
mutex_destroy(&dqp->q_qlock);
- kmem_zone_free(xfs_qm_dqzone, dqp);
- XFS_STATS_DEC(xs_qm_dquot);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
}
/*
@@ -92,26 +92,28 @@ xfs_qm_adjust_dqlimits(
{
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_disk_dquot *d = &dq->q_core;
+ struct xfs_def_quota *defq;
int prealloc = 0;
ASSERT(d->d_id);
+ defq = xfs_get_defquota(dq, q);
- if (q->qi_bsoftlimit && !d->d_blk_softlimit) {
- d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
+ if (defq->bsoftlimit && !d->d_blk_softlimit) {
+ d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit);
prealloc = 1;
}
- if (q->qi_bhardlimit && !d->d_blk_hardlimit) {
- d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+ if (defq->bhardlimit && !d->d_blk_hardlimit) {
+ d->d_blk_hardlimit = cpu_to_be64(defq->bhardlimit);
prealloc = 1;
}
- if (q->qi_isoftlimit && !d->d_ino_softlimit)
- d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
- if (q->qi_ihardlimit && !d->d_ino_hardlimit)
- d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
- if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
- d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
- if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
- d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+ if (defq->isoftlimit && !d->d_ino_softlimit)
+ d->d_ino_softlimit = cpu_to_be64(defq->isoftlimit);
+ if (defq->ihardlimit && !d->d_ino_hardlimit)
+ d->d_ino_hardlimit = cpu_to_be64(defq->ihardlimit);
+ if (defq->rtbsoftlimit && !d->d_rtb_softlimit)
+ d->d_rtb_softlimit = cpu_to_be64(defq->rtbsoftlimit);
+ if (defq->rtbhardlimit && !d->d_rtb_hardlimit)
+ d->d_rtb_hardlimit = cpu_to_be64(defq->rtbhardlimit);
if (prealloc)
xfs_dquot_set_prealloc_limits(dq);
@@ -232,7 +234,8 @@ xfs_qm_init_dquot_blk(
{
struct xfs_quotainfo *q = mp->m_quotainfo;
xfs_dqblk_t *d;
- int curid, i;
+ xfs_dqid_t curid;
+ int i;
ASSERT(tp);
ASSERT(xfs_buf_islocked(bp));
@@ -243,7 +246,6 @@ xfs_qm_init_dquot_blk(
* ID of the first dquot in the block - id's are zero based.
*/
curid = id - (id % q->qi_dqperchunk);
- ASSERT(curid >= 0);
memset(d, 0, BBTOB(q->qi_dqchunklen));
for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) {
d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
@@ -306,7 +308,7 @@ xfs_qm_dqalloc(
xfs_fsblock_t firstblock;
xfs_bmap_free_t flist;
xfs_bmbt_irec_t map;
- int nmaps, error, committed;
+ int nmaps, error;
xfs_buf_t *bp;
xfs_trans_t *tp = *tpp;
@@ -379,11 +381,12 @@ xfs_qm_dqalloc(
xfs_trans_bhold(tp, bp);
- if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+ error = xfs_bmap_finish(tpp, &flist, NULL);
+ if (error)
goto error1;
- }
- if (committed) {
+ /* Transaction was committed? */
+ if (*tpp != tp) {
tp = *tpp;
xfs_trans_bjoin(tp, bp);
} else {
@@ -393,9 +396,9 @@ xfs_qm_dqalloc(
*O_bpp = bp;
return 0;
- error1:
+error1:
xfs_bmap_cancel(&flist);
- error0:
+error0:
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return error;
@@ -463,12 +466,13 @@ xfs_qm_dqtobp(
struct xfs_bmbt_irec map;
int nmaps = 1, error;
struct xfs_buf *bp;
- struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp);
+ struct xfs_inode *quotip;
struct xfs_mount *mp = dqp->q_mount;
xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
struct xfs_trans *tp = (tpp ? *tpp : NULL);
uint lock_mode;
+ quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
lock_mode = xfs_ilock_data_map_shared(quotip);
@@ -605,7 +609,7 @@ xfs_qm_dqread(
break;
}
- XFS_STATS_INC(xs_qm_dquot);
+ XFS_STATS_INC(mp, xs_qm_dquot);
trace_xfs_dqread(dqp);
@@ -684,6 +688,56 @@ error0:
}
/*
+ * Advance to the next id in the current chunk, or if at the
+ * end of the chunk, skip ahead to first id in next allocated chunk
+ * using the SEEK_DATA interface.
+ */
+int
+xfs_dq_get_next_id(
+ xfs_mount_t *mp,
+ uint type,
+ xfs_dqid_t *id,
+ loff_t eof)
+{
+ struct xfs_inode *quotip;
+ xfs_fsblock_t start;
+ loff_t offset;
+ uint lock;
+ xfs_dqid_t next_id;
+ int error = 0;
+
+ /* Simple advance */
+ next_id = *id + 1;
+
+ /* If new ID is within the current chunk, advancing it sufficed */
+ if (next_id % mp->m_quotainfo->qi_dqperchunk) {
+ *id = next_id;
+ return 0;
+ }
+
+ /* Nope, next_id is now past the current chunk, so find the next one */
+ start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk;
+
+ quotip = xfs_quota_inode(mp, type);
+ lock = xfs_ilock_data_map_shared(quotip);
+
+ offset = __xfs_seek_hole_data(VFS_I(quotip), XFS_FSB_TO_B(mp, start),
+ eof, SEEK_DATA);
+ if (offset < 0)
+ error = offset;
+
+ xfs_iunlock(quotip, lock);
+
+ /* -ENXIO is essentially "no more data" */
+ if (error)
+ return (error == -ENXIO ? -ENOENT: error);
+
+ /* Convert next data offset back to a quota id */
+ *id = XFS_B_TO_FSB(mp, offset) * mp->m_quotainfo->qi_dqperchunk;
+ return 0;
+}
+
+/*
* Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
* a locked dquot, doing an allocation (if requested) as needed.
* When both an inode and an id are given, the inode's id takes precedence.
@@ -703,6 +757,7 @@ xfs_qm_dqget(
struct xfs_quotainfo *qi = mp->m_quotainfo;
struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
+ loff_t eof = 0;
int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -730,6 +785,21 @@ xfs_qm_dqget(
}
#endif
+ /* Get the end of the quota file if we need it */
+ if (flags & XFS_QMOPT_DQNEXT) {
+ struct xfs_inode *quotip;
+ xfs_fileoff_t last;
+ uint lock_mode;
+
+ quotip = xfs_quota_inode(mp, type);
+ lock_mode = xfs_ilock_data_map_shared(quotip);
+ error = xfs_bmap_last_offset(quotip, &last, XFS_DATA_FORK);
+ xfs_iunlock(quotip, lock_mode);
+ if (error)
+ return error;
+ eof = XFS_FSB_TO_B(mp, last);
+ }
+
restart:
mutex_lock(&qi->qi_tree_lock);
dqp = radix_tree_lookup(tree, id);
@@ -743,16 +813,28 @@ restart:
goto restart;
}
+ /* uninit / unused quota found in radix tree, keep looking */
+ if (flags & XFS_QMOPT_DQNEXT) {
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ error = xfs_dq_get_next_id(mp, type, &id, eof);
+ if (error)
+ return error;
+ goto restart;
+ }
+ }
+
dqp->q_nrefs++;
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_hit(dqp);
- XFS_STATS_INC(xs_qm_dqcachehits);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
*O_dqpp = dqp;
return 0;
}
mutex_unlock(&qi->qi_tree_lock);
- XFS_STATS_INC(xs_qm_dqcachemisses);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -769,6 +851,13 @@ restart:
if (ip)
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ /* If we are asked to find next active id, keep looking */
+ if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
+ error = xfs_dq_get_next_id(mp, type, &id, eof);
+ if (!error)
+ goto restart;
+ }
+
if (error)
return error;
@@ -806,7 +895,7 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
xfs_qm_dqdestroy(dqp);
- XFS_STATS_INC(xs_qm_dquot_dups);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
goto restart;
}
@@ -819,6 +908,17 @@ restart:
qi->qi_dquots++;
mutex_unlock(&qi->qi_tree_lock);
+ /* If we are asked to find next active id, keep looking */
+ if (flags & XFS_QMOPT_DQNEXT) {
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ xfs_qm_dqput(dqp);
+ error = xfs_dq_get_next_id(mp, type, &id, eof);
+ if (error)
+ return error;
+ goto restart;
+ }
+ }
+
dqret:
ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
trace_xfs_dqget_miss(dqp);
@@ -846,7 +946,7 @@ xfs_qm_dqput(
trace_xfs_dqput_free(dqp);
if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
- XFS_STATS_INC(xs_qm_dquot_unused);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 74d0e5966ebc..88693a98fac5 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -164,9 +164,9 @@ xfs_verifier_error(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+ xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
- __return_address, bp->b_bn);
+ __return_address, bp->b_ops->name, bp->b_bn);
xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 652cd3c5b58c..a1b2dd828b9d 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -152,7 +152,7 @@ xfs_nfs_get_inode(
return ERR_PTR(error);
}
- if (ip->i_d.di_gen != generation) {
+ if (VFS_I(ip)->i_generation != generation) {
IRELE(ip);
return ERR_PTR(-ESTALE);
}
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
.fh_to_parent = xfs_fs_fh_to_parent,
.get_parent = xfs_fs_get_parent,
.commit_metadata = xfs_fs_nfs_commit_metadata,
-#ifdef CONFIG_NFSD_PNFS
+#ifdef CONFIG_NFSD_BLOCKLAYOUT
.get_uuid = xfs_fs_get_uuid,
.map_blocks = xfs_fs_map_blocks,
.commit_blocks = xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb400e22..569938a4a357 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -55,7 +55,7 @@ xfs_rw_ilock(
int type)
{
if (type & XFS_IOLOCK_EXCL)
- mutex_lock(&VFS_I(ip)->i_mutex);
+ inode_lock(VFS_I(ip));
xfs_ilock(ip, type);
}
@@ -66,7 +66,7 @@ xfs_rw_iunlock(
{
xfs_iunlock(ip, type);
if (type & XFS_IOLOCK_EXCL)
- mutex_unlock(&VFS_I(ip)->i_mutex);
+ inode_unlock(VFS_I(ip));
}
static inline void
@@ -76,7 +76,7 @@ xfs_rw_ilock_demote(
{
xfs_ilock_demote(ip, type);
if (type & XFS_IOLOCK_EXCL)
- mutex_unlock(&VFS_I(ip)->i_mutex);
+ inode_unlock(VFS_I(ip));
}
/*
@@ -106,8 +106,8 @@ xfs_iozero(
unsigned offset, bytes;
void *fsdata;
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- bytes = PAGE_CACHE_SIZE - offset;
+ offset = (pos & (PAGE_SIZE -1)); /* Within page */
+ bytes = PAGE_SIZE - offset;
if (bytes > count)
bytes = count;
@@ -156,9 +156,9 @@ xfs_update_prealloc_flags(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
if (!(flags & XFS_PREALLOC_INVISIBLE)) {
- ip->i_d.di_mode &= ~S_ISUID;
- if (ip->i_d.di_mode & S_IXGRP)
- ip->i_d.di_mode &= ~S_ISGID;
+ VFS_I(ip)->i_mode &= ~S_ISUID;
+ if (VFS_I(ip)->i_mode & S_IXGRP)
+ VFS_I(ip)->i_mode &= ~S_ISGID;
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
}
@@ -242,19 +242,30 @@ xfs_file_fsync(
}
/*
- * All metadata updates are logged, which means that we just have
- * to flush the log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to
+ * flush the log up to the latest LSN that touched the inode. If we have
+ * concurrent fsync/fdatasync() calls, we need them to all block on the
+ * log force before we clear the ili_fsync_fields field. This ensures
+ * that we don't get a racing sync operation that does not wait for the
+ * metadata to hit the journal before returning. If we race with
+ * clearing the ili_fsync_fields, then all that will happen is the log
+ * force will do nothing as the lsn will already be on disk. We can't
+ * race with setting ili_fsync_fields because that is done under
+ * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+ * until after the ili_fsync_fields is cleared.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
if (!datasync ||
- (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (lsn)
+ if (lsn) {
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+ ip->i_itemp->ili_fsync_fields = 0;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
/*
* If we only have a single device, and the log force about was
@@ -287,7 +298,7 @@ xfs_file_read_iter(
xfs_fsize_t n;
loff_t pos = iocb->ki_pos;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(mp, xs_read_calls);
if (unlikely(iocb->ki_flags & IOCB_DIRECT))
ioflags |= XFS_IO_ISDIRECT;
@@ -365,7 +376,7 @@ xfs_file_read_iter(
ret = generic_file_read_iter(iocb, to);
if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -383,7 +394,7 @@ xfs_file_splice_read(
int ioflags = 0;
ssize_t ret;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(ip->i_mount, xs_read_calls);
if (infilp->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
@@ -391,19 +402,26 @@ xfs_file_splice_read(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-
trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
- /* for dax, we need to avoid the page cache */
- if (IS_DAX(VFS_I(ip)))
- ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
- else
- ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ /*
+ * DAX inodes cannot use the page cache for splice, so we have to push
+ * them through the VFS IO path. This means it goes through
+ * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
+ * cannot lock the splice operation at this level for DAX inodes.
+ */
+ if (IS_DAX(VFS_I(ip))) {
+ ret = default_file_splice_read(infilp, ppos, pipe, count,
+ flags);
+ goto out;
+ }
+ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+out:
+ if (ret > 0)
+ XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
return ret;
}
@@ -482,6 +500,8 @@ xfs_zero_eof(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
+ trace_xfs_zero_eof(ip, isize, offset - isize);
+
/*
* First handle zeroing the block on which isize resides.
*
@@ -574,6 +594,7 @@ xfs_file_aio_write_checks(
struct xfs_inode *ip = XFS_I(inode);
ssize_t error = 0;
size_t count = iov_iter_count(from);
+ bool drained_dio = false;
restart:
error = generic_write_checks(iocb, from);
@@ -611,12 +632,13 @@ restart:
bool zero = false;
spin_unlock(&ip->i_flags_lock);
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
-
+ if (!drained_dio) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
/*
* We now have an IO submission barrier in place, but
* AIO can do EOF updates during IO completion and hence
@@ -626,6 +648,7 @@ restart:
* no-op.
*/
inode_dio_wait(inode);
+ drained_dio = true;
goto restart;
}
error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@ -776,8 +799,8 @@ xfs_file_dio_aio_write(
/* see generic_file_direct_write() for why this is necessary */
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT,
- end >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
}
if (ret > 0) {
@@ -867,7 +890,7 @@ xfs_file_write_iter(
ssize_t ret;
size_t ocount = iov_iter_count(from);
- XFS_STATS_INC(xs_write_calls);
+ XFS_STATS_INC(ip->i_mount, xs_write_calls);
if (ocount == 0)
return 0;
@@ -883,7 +906,7 @@ xfs_file_write_iter(
if (ret > 0) {
ssize_t err;
- XFS_STATS_ADD(xs_write_bytes, ret);
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1184,9 +1207,9 @@ xfs_find_get_desired_pgoff(
pagevec_init(&pvec, 0);
- index = startoff >> PAGE_CACHE_SHIFT;
+ index = startoff >> PAGE_SHIFT;
endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
- end = endoff >> PAGE_CACHE_SHIFT;
+ end = endoff >> PAGE_SHIFT;
do {
int want;
unsigned nr_pages;
@@ -1314,31 +1337,31 @@ out:
return found;
}
-STATIC loff_t
-xfs_seek_hole_data(
- struct file *file,
+/*
+ * caller must lock inode with xfs_ilock_data_map_shared,
+ * can we craft an appropriate ASSERT?
+ *
+ * end is because the VFS-level lseek interface is defined such that any
+ * offset past i_size shall return -ENXIO, but we use this for quota code
+ * which does not maintain i_size, and we want to SEEK_DATA past i_size.
+ */
+loff_t
+__xfs_seek_hole_data(
+ struct inode *inode,
loff_t start,
+ loff_t end,
int whence)
{
- struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
loff_t uninitialized_var(offset);
- xfs_fsize_t isize;
xfs_fileoff_t fsbno;
- xfs_filblks_t end;
- uint lock;
+ xfs_filblks_t lastbno;
int error;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- lock = xfs_ilock_data_map_shared(ip);
-
- isize = i_size_read(inode);
- if (start >= isize) {
+ if (start >= end) {
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
/*
@@ -1346,22 +1369,22 @@ xfs_seek_hole_data(
* by fsbno to the end block of the file.
*/
fsbno = XFS_B_TO_FSBT(mp, start);
- end = XFS_B_TO_FSB(mp, isize);
+ lastbno = XFS_B_TO_FSB(mp, end);
for (;;) {
struct xfs_bmbt_irec map[2];
int nmap = 2;
unsigned int i;
- error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+ error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
XFS_BMAPI_ENTIRE);
if (error)
- goto out_unlock;
+ goto out_error;
/* No extents at given offset, must be beyond EOF */
if (nmap == 0) {
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
for (i = 0; i < nmap; i++) {
@@ -1403,7 +1426,7 @@ xfs_seek_hole_data(
* hole at the end of any file).
*/
if (whence == SEEK_HOLE) {
- offset = isize;
+ offset = end;
break;
}
/*
@@ -1411,7 +1434,7 @@ xfs_seek_hole_data(
*/
ASSERT(whence == SEEK_DATA);
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
ASSERT(i > 1);
@@ -1422,14 +1445,14 @@ xfs_seek_hole_data(
*/
fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
start = XFS_FSB_TO_B(mp, fsbno);
- if (start >= isize) {
+ if (start >= end) {
if (whence == SEEK_HOLE) {
- offset = isize;
+ offset = end;
break;
}
ASSERT(whence == SEEK_DATA);
error = -ENXIO;
- goto out_unlock;
+ goto out_error;
}
}
@@ -1441,7 +1464,39 @@ out:
* situation in particular.
*/
if (whence == SEEK_HOLE)
- offset = min_t(loff_t, offset, isize);
+ offset = min_t(loff_t, offset, end);
+
+ return offset;
+
+out_error:
+ return error;
+}
+
+STATIC loff_t
+xfs_seek_hole_data(
+ struct file *file,
+ loff_t start,
+ int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ uint lock;
+ loff_t offset, end;
+ int error = 0;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lock = xfs_ilock_data_map_shared(ip);
+
+ end = i_size_read(inode);
+ offset = __xfs_seek_hole_data(inode, start, end, whence);
+ if (offset < 0) {
+ error = offset;
+ goto out_unlock;
+ }
+
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
@@ -1477,7 +1532,7 @@ xfs_file_llseek(
*
* mmap_sem (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmap_lock (XFS - truncate serialisation)
+ * i_mmaplock (XFS - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1503,10 +1558,9 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
- xfs_end_io_dax_write);
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else {
- ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
}
@@ -1538,7 +1592,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1546,6 +1600,13 @@ xfs_filemap_fault(
return ret;
}
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases here. @flags carries the information on the type of fault
+ * occurring.
+ */
STATIC int
xfs_filemap_pmd_fault(
struct vm_area_struct *vma,
@@ -1562,15 +1623,55 @@ xfs_filemap_pmd_fault(
trace_xfs_filemap_pmd_fault(ip);
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
- xfs_end_io_dax_write);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+ NULL);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults. In reality, it's needed to serialise against
+ * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
+ * to ensure we serialise the fault barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ trace_xfs_filemap_pfn_mkwrite(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+
+ /* check if the faulting page hasn't raced with truncate */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ else if (IS_DAX(inode))
+ ret = dax_pfn_mkwrite(vma, vmf);
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
return ret;
+
}
static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1578,6 +1679,7 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
.pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
+ .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
STATIC int
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index c4c130f9bfb6..a51353a1f87f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -151,7 +151,7 @@ xfs_filestream_pick_ag(
xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
int err, trylock, nscan;
- ASSERT(S_ISDIR(ip->i_d.di_mode));
+ ASSERT(S_ISDIR(VFS_I(ip)->i_mode));
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
@@ -319,7 +319,7 @@ xfs_filestream_lookup_ag(
xfs_agnumber_t startag, ag = NULLAGNUMBER;
struct xfs_mru_cache_elem *mru;
- ASSERT(S_ISREG(ip->i_d.di_mode));
+ ASSERT(S_ISREG(VFS_I(ip)->i_mode));
pip = xfs_filestream_get_parent(ip);
if (!pip)
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 1b6a98b66886..f32713f14f9a 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,5 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0a326bd64d4e..bf2d60749278 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,7 +63,10 @@ xfs_inode_alloc(
return NULL;
}
- XFS_STATS_INC(vn_active);
+ /* VFS doesn't initialise i_mode! */
+ VFS_I(ip)->i_mode = 0;
+
+ XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
@@ -79,7 +82,7 @@ xfs_inode_alloc(
memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
ip->i_flags = 0;
ip->i_delayed_blks = 0;
- memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+ memset(&ip->i_d, 0, sizeof(ip->i_d));
return ip;
}
@@ -98,7 +101,7 @@ void
xfs_inode_free(
struct xfs_inode *ip)
{
- switch (ip->i_d.di_mode & S_IFMT) {
+ switch (VFS_I(ip)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFDIR:
case S_IFLNK:
@@ -129,12 +132,40 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!xfs_isiflocked(ip));
- XFS_STATS_DEC(vn_active);
+ XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
/*
+ * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
+ * part of the structure. This is made more complex by the fact we store
+ * information about the on-disk values in the VFS inode and so we can't just
+ * overwrite the values unconditionally. Hence we save the parameters we
+ * need to retain across reinitialisation, and rewrite them into the VFS inode
+ * after reinitialisation even if it fails.
+ */
+static int
+xfs_reinit_inode(
+ struct xfs_mount *mp,
+ struct inode *inode)
+{
+ int error;
+ uint32_t nlink = inode->i_nlink;
+ uint32_t generation = inode->i_generation;
+ uint64_t version = inode->i_version;
+ umode_t mode = inode->i_mode;
+
+ error = inode_init_always(mp->m_super, inode);
+
+ set_nlink(inode, nlink);
+ inode->i_generation = generation;
+ inode->i_version = version;
+ inode->i_mode = mode;
+ return error;
+}
+
+/*
* Check the validity of the inode we just found it the cache
*/
static int
@@ -159,7 +190,7 @@ xfs_iget_cache_hit(
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -177,7 +208,7 @@ xfs_iget_cache_hit(
*/
if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -185,7 +216,7 @@ xfs_iget_cache_hit(
/*
* If lookup is racing with unlink return an error immediately.
*/
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
error = -ENOENT;
goto out_error;
}
@@ -208,7 +239,7 @@ xfs_iget_cache_hit(
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
- error = inode_init_always(mp->m_super, inode);
+ error = xfs_reinit_inode(mp, inode);
if (error) {
/*
* Re-initializing the inode failed, and we are in deep
@@ -259,7 +290,7 @@ xfs_iget_cache_hit(
xfs_ilock(ip, lock_flags);
xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
- XFS_STATS_INC(xs_ig_found);
+ XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -295,7 +326,7 @@ xfs_iget_cache_miss(
trace_xfs_iget_miss(ip);
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+ if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
error = -ENOENT;
goto out_destroy;
}
@@ -342,7 +373,7 @@ xfs_iget_cache_miss(
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
- XFS_STATS_INC(xs_ig_dup);
+ XFS_STATS_INC(mp, xs_ig_dup);
error = -EAGAIN;
goto out_preload_end;
}
@@ -412,7 +443,7 @@ xfs_iget(
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return -EINVAL;
- XFS_STATS_INC(xs_ig_attempts);
+ XFS_STATS_INC(mp, xs_ig_attempts);
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -429,7 +460,7 @@ again:
goto out_error_or_again;
} else {
rcu_read_unlock();
- XFS_STATS_INC(xs_ig_missed);
+ XFS_STATS_INC(mp, xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
flags, lock_flags);
@@ -444,7 +475,7 @@ again:
* If we have a real type for an on-disk inode, we can setup the inode
* now. If it's a new inode being created, xfs_ialloc will handle it.
*/
- if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+ if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
xfs_setup_existing_inode(ip);
return 0;
@@ -965,7 +996,7 @@ reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- XFS_STATS_INC(xs_ig_reclaims);
+ XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
/*
* Remove the inode from the per-AG radix tree.
*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dc40a6d5ae0d..96f606deee31 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -57,9 +57,9 @@ kmem_zone_t *xfs_inode_zone;
*/
#define XFS_ITRUNC_MAX_EXTENTS 2
-STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
-
-STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
+STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
+STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
+STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
/*
* helper function to extract extent size hint from inode
@@ -610,60 +610,69 @@ __xfs_iflock(
STATIC uint
_xfs_dic2xflags(
- __uint16_t di_flags)
+ __uint16_t di_flags,
+ uint64_t di_flags2,
+ bool has_attr)
{
uint flags = 0;
if (di_flags & XFS_DIFLAG_ANY) {
if (di_flags & XFS_DIFLAG_REALTIME)
- flags |= XFS_XFLAG_REALTIME;
+ flags |= FS_XFLAG_REALTIME;
if (di_flags & XFS_DIFLAG_PREALLOC)
- flags |= XFS_XFLAG_PREALLOC;
+ flags |= FS_XFLAG_PREALLOC;
if (di_flags & XFS_DIFLAG_IMMUTABLE)
- flags |= XFS_XFLAG_IMMUTABLE;
+ flags |= FS_XFLAG_IMMUTABLE;
if (di_flags & XFS_DIFLAG_APPEND)
- flags |= XFS_XFLAG_APPEND;
+ flags |= FS_XFLAG_APPEND;
if (di_flags & XFS_DIFLAG_SYNC)
- flags |= XFS_XFLAG_SYNC;
+ flags |= FS_XFLAG_SYNC;
if (di_flags & XFS_DIFLAG_NOATIME)
- flags |= XFS_XFLAG_NOATIME;
+ flags |= FS_XFLAG_NOATIME;
if (di_flags & XFS_DIFLAG_NODUMP)
- flags |= XFS_XFLAG_NODUMP;
+ flags |= FS_XFLAG_NODUMP;
if (di_flags & XFS_DIFLAG_RTINHERIT)
- flags |= XFS_XFLAG_RTINHERIT;
+ flags |= FS_XFLAG_RTINHERIT;
if (di_flags & XFS_DIFLAG_PROJINHERIT)
- flags |= XFS_XFLAG_PROJINHERIT;
+ flags |= FS_XFLAG_PROJINHERIT;
if (di_flags & XFS_DIFLAG_NOSYMLINKS)
- flags |= XFS_XFLAG_NOSYMLINKS;
+ flags |= FS_XFLAG_NOSYMLINKS;
if (di_flags & XFS_DIFLAG_EXTSIZE)
- flags |= XFS_XFLAG_EXTSIZE;
+ flags |= FS_XFLAG_EXTSIZE;
if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
- flags |= XFS_XFLAG_EXTSZINHERIT;
+ flags |= FS_XFLAG_EXTSZINHERIT;
if (di_flags & XFS_DIFLAG_NODEFRAG)
- flags |= XFS_XFLAG_NODEFRAG;
+ flags |= FS_XFLAG_NODEFRAG;
if (di_flags & XFS_DIFLAG_FILESTREAM)
- flags |= XFS_XFLAG_FILESTREAM;
+ flags |= FS_XFLAG_FILESTREAM;
+ }
+
+ if (di_flags2 & XFS_DIFLAG2_ANY) {
+ if (di_flags2 & XFS_DIFLAG2_DAX)
+ flags |= FS_XFLAG_DAX;
}
+ if (has_attr)
+ flags |= FS_XFLAG_HASATTR;
+
return flags;
}
uint
xfs_ip2xflags(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- xfs_icdinode_t *dic = &ip->i_d;
+ struct xfs_icdinode *dic = &ip->i_d;
- return _xfs_dic2xflags(dic->di_flags) |
- (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
+ return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
}
uint
xfs_dic2xflags(
- xfs_dinode_t *dip)
+ struct xfs_dinode *dip)
{
- return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
- (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
+ return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
+ be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
}
/*
@@ -757,6 +766,7 @@ xfs_ialloc(
uint flags;
int error;
struct timespec tv;
+ struct inode *inode;
/*
* Call the space management code to pick
@@ -782,6 +792,7 @@ xfs_ialloc(
if (error)
return error;
ASSERT(ip != NULL);
+ inode = VFS_I(ip);
/*
* We always convert v1 inodes to v2 now - we only support filesystems
@@ -791,20 +802,16 @@ xfs_ialloc(
if (ip->i_d.di_version == 1)
ip->i_d.di_version = 2;
- ip->i_d.di_mode = mode;
- ip->i_d.di_onlink = 0;
- ip->i_d.di_nlink = nlink;
- ASSERT(ip->i_d.di_nlink == nlink);
+ inode->i_mode = mode;
+ set_nlink(inode, nlink);
ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
xfs_set_projid(ip, prid);
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
if (pip && XFS_INHERIT_GID(pip)) {
ip->i_d.di_gid = pip->i_d.di_gid;
- if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
- ip->i_d.di_mode |= S_ISGID;
- }
+ if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
+ inode->i_mode |= S_ISGID;
}
/*
@@ -813,38 +820,29 @@ xfs_ialloc(
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
if ((irix_sgid_inherit) &&
- (ip->i_d.di_mode & S_ISGID) &&
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
- ip->i_d.di_mode &= ~S_ISGID;
- }
+ (inode->i_mode & S_ISGID) &&
+ (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
+ inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
tv = current_fs_time(mp->m_super);
- ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
- ip->i_d.di_atime = ip->i_d.di_mtime;
- ip->i_d.di_ctime = ip->i_d.di_mtime;
+ inode->i_mtime = tv;
+ inode->i_atime = tv;
+ inode->i_ctime = tv;
- /*
- * di_gen will have been taken care of in xfs_iread.
- */
ip->i_d.di_extsize = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
if (ip->i_d.di_version == 3) {
- ASSERT(ip->i_d.di_ino == ino);
- ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
- ip->i_d.di_crc = 0;
- ip->i_d.di_changecount = 1;
- ip->i_d.di_lsn = 0;
+ inode->i_version = 1;
ip->i_d.di_flags2 = 0;
- memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
- ip->i_d.di_crtime = ip->i_d.di_mtime;
+ ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
+ ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
}
@@ -862,7 +860,8 @@ xfs_ialloc(
case S_IFREG:
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
- uint di_flags = 0;
+ uint64_t di_flags2 = 0;
+ uint di_flags = 0;
if (S_ISDIR(mode)) {
if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
@@ -898,7 +897,11 @@ xfs_ialloc(
di_flags |= XFS_DIFLAG_NODEFRAG;
if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
di_flags |= XFS_DIFLAG_FILESTREAM;
+ if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+
ip->i_d.di_flags |= di_flags;
+ ip->i_d.di_flags2 |= di_flags2;
}
/* FALLTHROUGH */
case S_IFLNK:
@@ -1078,35 +1081,24 @@ xfs_dir_ialloc(
}
/*
- * Decrement the link count on an inode & log the change.
- * If this causes the link count to go to zero, initiate the
- * logging activity required to truncate a file.
+ * Decrement the link count on an inode & log the change. If this causes the
+ * link count to go to zero, move the inode to AGI unlinked list so that it can
+ * be freed when the last active reference goes away via xfs_inactive().
*/
int /* error */
xfs_droplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
{
- int error;
-
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT (ip->i_d.di_nlink > 0);
- ip->i_d.di_nlink--;
drop_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = 0;
- if (ip->i_d.di_nlink == 0) {
- /*
- * We're dropping the last link to this file.
- * Move the on-disk inode to the AGI unlinked list.
- * From xfs_inactive() we will pull the inode from
- * the list and free it.
- */
- error = xfs_iunlink(tp, ip);
- }
- return error;
+ if (VFS_I(ip)->i_nlink)
+ return 0;
+
+ return xfs_iunlink(tp, ip);
}
/*
@@ -1120,8 +1112,6 @@ xfs_bumplink(
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
ASSERT(ip->i_d.di_version > 1);
- ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
- ip->i_d.di_nlink++;
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
return 0;
@@ -1143,7 +1133,6 @@ xfs_create(
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- int committed;
prid_t prid;
struct xfs_dquot *udqp = NULL;
struct xfs_dquot *gdqp = NULL;
@@ -1226,7 +1215,7 @@ xfs_create(
* pointing to itself.
*/
error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
- prid, resblks > 0, &ip, &committed);
+ prid, resblks > 0, &ip, NULL);
if (error)
goto out_trans_cancel;
@@ -1275,7 +1264,7 @@ xfs_create(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -1380,7 +1369,6 @@ xfs_create_tmpfile(
*/
xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
- ip->i_d.di_nlink--;
error = xfs_iunlink(tp, ip);
if (error)
goto out_trans_cancel;
@@ -1427,12 +1415,11 @@ xfs_link(
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
int resblks;
trace_xfs_link(tdp, target_name);
- ASSERT(!S_ISDIR(sip->i_d.di_mode));
+ ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1480,7 +1467,10 @@ xfs_link(
xfs_bmap_init(&free_list, &first_block);
- if (sip->i_d.di_nlink == 0) {
+ /*
+ * Handle initial link state of O_TMPFILE inode
+ */
+ if (VFS_I(sip)->i_nlink == 0) {
error = xfs_iunlink_remove(tp, sip);
if (error)
goto error_return;
@@ -1502,11 +1492,10 @@ xfs_link(
* link transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- }
- error = xfs_bmap_finish (&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error) {
xfs_bmap_cancel(&free_list);
goto error_return;
@@ -1555,7 +1544,6 @@ xfs_itruncate_extents(
xfs_fileoff_t first_unmap_block;
xfs_fileoff_t last_block;
xfs_filblks_t unmap_len;
- int committed;
int error = 0;
int done = 0;
@@ -1601,9 +1589,7 @@ xfs_itruncate_extents(
* Duplicate the transaction that has the permanent
* reservation and commit the old transaction.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (committed)
- xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_bmap_finish(&tp, &free_list, ip);
if (error)
goto out_bmap_cancel;
@@ -1640,7 +1626,7 @@ xfs_release(
xfs_mount_t *mp = ip->i_mount;
int error;
- if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
+ if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
return 0;
/* If this is a read-only mount, don't do this (would generate I/O) */
@@ -1671,7 +1657,7 @@ xfs_release(
}
}
- if (ip->i_d.di_nlink == 0)
+ if (VFS_I(ip)->i_nlink == 0)
return 0;
if (xfs_can_free_eofblocks(ip, false)) {
@@ -1774,7 +1760,6 @@ xfs_inactive_ifree(
{
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int error;
@@ -1841,7 +1826,7 @@ xfs_inactive_ifree(
* Just ignore errors at this point. There is nothing we can do except
* to try to keep going. Make sure it's not a silent error.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error) {
xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
__func__, error);
@@ -1876,7 +1861,7 @@ xfs_inactive(
* If the inode is already free, then there can be nothing
* to clean up here.
*/
- if (ip->i_d.di_mode == 0) {
+ if (VFS_I(ip)->i_mode == 0) {
ASSERT(ip->i_df.if_real_bytes == 0);
ASSERT(ip->i_df.if_broot_bytes == 0);
return;
@@ -1888,7 +1873,7 @@ xfs_inactive(
if (mp->m_flags & XFS_MOUNT_RDONLY)
return;
- if (ip->i_d.di_nlink != 0) {
+ if (VFS_I(ip)->i_nlink != 0) {
/*
* force is true because we are evicting an inode from the
* cache. Post-eof blocks must be freed, lest we end up with
@@ -1900,7 +1885,7 @@ xfs_inactive(
return;
}
- if (S_ISREG(ip->i_d.di_mode) &&
+ if (S_ISREG(VFS_I(ip)->i_mode) &&
(ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
@@ -1909,7 +1894,7 @@ xfs_inactive(
if (error)
return;
- if (S_ISLNK(ip->i_d.di_mode))
+ if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
else if (truncate)
error = xfs_inactive_truncate(ip);
@@ -1945,16 +1930,21 @@ xfs_inactive(
}
/*
- * This is called when the inode's link count goes to 0.
- * We place the on-disk inode on a list in the AGI. It
- * will be pulled from this list when the inode is freed.
+ * This is called when the inode's link count goes to 0 or we are creating a
+ * tmpfile via O_TMPFILE. In the case of a tmpfile, the link count is still
+ * non-zero when we get here, as it is only dropped to zero by the VFS after
+ * we've created the file successfully, so we have to add it to the unlinked
+ * list while the link count is non-zero.
+ *
+ * We place the on-disk inode on a list in the AGI. It will be pulled from this
+ * list when the inode is freed.
*/
-int
+STATIC int
xfs_iunlink(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp;
+ xfs_mount_t *mp = tp->t_mountp;
xfs_agi_t *agi;
xfs_dinode_t *dip;
xfs_buf_t *agibp;
@@ -1964,10 +1954,7 @@ xfs_iunlink(
int offset;
int error;
- ASSERT(ip->i_d.di_nlink == 0);
- ASSERT(ip->i_d.di_mode != 0);
-
- mp = tp->t_mountp;
+ ASSERT(VFS_I(ip)->i_mode != 0);
/*
* Get the agi buffer first. It ensures lock ordering
@@ -2365,6 +2352,7 @@ retry:
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -2404,10 +2392,10 @@ xfs_ifree(
struct xfs_icluster xic = { 0 };
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_d.di_nlink == 0);
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(ip->i_d.di_nextents == 0);
ASSERT(ip->i_d.di_anextents == 0);
- ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
+ ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
ASSERT(ip->i_d.di_nblocks == 0);
/*
@@ -2421,7 +2409,7 @@ xfs_ifree(
if (error)
return error;
- ip->i_d.di_mode = 0; /* mark incore inode as free */
+ VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
@@ -2431,7 +2419,7 @@ xfs_ifree(
* Bump the generation count so no one will be confused
* by reincarnations of this inode.
*/
- ip->i_d.di_gen++;
+ VFS_I(ip)->i_generation++;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xic.deleted)
@@ -2518,11 +2506,10 @@ xfs_remove(
{
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp = NULL;
- int is_dir = S_ISDIR(ip->i_d.di_mode);
+ int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
- int committed;
uint resblks;
trace_xfs_remove(dp, name);
@@ -2573,8 +2560,8 @@ xfs_remove(
* If we're removing a directory perform some additional validation.
*/
if (is_dir) {
- ASSERT(ip->i_d.di_nlink >= 2);
- if (ip->i_d.di_nlink != 2) {
+ ASSERT(VFS_I(ip)->i_nlink >= 2);
+ if (VFS_I(ip)->i_nlink != 2) {
error = -ENOTEMPTY;
goto out_trans_cancel;
}
@@ -2623,7 +2610,7 @@ xfs_remove(
if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -2700,7 +2687,6 @@ xfs_finish_rename(
struct xfs_trans *tp,
struct xfs_bmap_free *free_list)
{
- int committed = 0;
int error;
/*
@@ -2710,7 +2696,7 @@ xfs_finish_rename(
if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- error = xfs_bmap_finish(&tp, free_list, &committed);
+ error = xfs_bmap_finish(&tp, free_list, NULL);
if (error) {
xfs_bmap_cancel(free_list);
xfs_trans_cancel(tp);
@@ -2765,7 +2751,7 @@ xfs_cross_rename(
if (dp1 != dp2) {
dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
- if (S_ISDIR(ip2->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(ip2)->i_mode)) {
error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
dp1->i_ino, first_block,
free_list, spaceres);
@@ -2773,7 +2759,7 @@ xfs_cross_rename(
goto out_trans_abort;
/* transfer ip2 ".." reference to dp1 */
- if (!S_ISDIR(ip1->i_d.di_mode)) {
+ if (!S_ISDIR(VFS_I(ip1)->i_mode)) {
error = xfs_droplink(tp, dp2);
if (error)
goto out_trans_abort;
@@ -2792,7 +2778,7 @@ xfs_cross_rename(
ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
}
- if (S_ISDIR(ip1->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(ip1)->i_mode)) {
error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
dp2->i_ino, first_block,
free_list, spaceres);
@@ -2800,7 +2786,7 @@ xfs_cross_rename(
goto out_trans_abort;
/* transfer ip1 ".." reference to dp2 */
- if (!S_ISDIR(ip2->i_d.di_mode)) {
+ if (!S_ISDIR(VFS_I(ip2)->i_mode)) {
error = xfs_droplink(tp, dp1);
if (error)
goto out_trans_abort;
@@ -2897,7 +2883,7 @@ xfs_rename(
struct xfs_inode *inodes[__XFS_SORT_INODES];
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
- bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
+ bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
int spaceres;
int error;
@@ -3026,12 +3012,12 @@ xfs_rename(
* target and source are directories and that target can be
* destroyed, or that neither is a directory.
*/
- if (S_ISDIR(target_ip->i_d.di_mode)) {
+ if (S_ISDIR(VFS_I(target_ip)->i_mode)) {
/*
* Make sure target dir is empty.
*/
if (!(xfs_dir_isempty(target_ip)) ||
- (target_ip->i_d.di_nlink > 2)) {
+ (VFS_I(target_ip)->i_nlink > 2)) {
error = -EEXIST;
goto out_trans_cancel;
}
@@ -3138,7 +3124,7 @@ xfs_rename(
* intermediate state on disk.
*/
if (wip) {
- ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0);
+ ASSERT(VFS_I(wip)->i_nlink == 0);
error = xfs_bumplink(tp, wip);
if (error)
goto out_bmap_cancel;
@@ -3271,8 +3257,8 @@ xfs_iflush_cluster(
}
if (clcount) {
- XFS_STATS_INC(xs_icluster_flushcnt);
- XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
}
out_free:
@@ -3307,7 +3293,7 @@ cluster_corrupt_out:
* mark it as stale and brelse.
*/
if (bp->b_iodone) {
- XFS_BUF_UNDONE(bp);
+ bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
xfs_buf_ioerror(bp, -EIO);
xfs_buf_ioend(bp);
@@ -3345,7 +3331,7 @@ xfs_iflush(
struct xfs_dinode *dip;
int error;
- XFS_STATS_INC(xs_iflush_count);
+ XFS_STATS_INC(mp, xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
@@ -3456,14 +3442,7 @@ xfs_iflush_int(
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
goto corrupt_out;
}
- if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
- mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
- xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
- __func__, ip->i_ino, ip, ip->i_d.di_magic);
- goto corrupt_out;
- }
- if (S_ISREG(ip->i_d.di_mode)) {
+ if (S_ISREG(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
@@ -3473,7 +3452,7 @@ xfs_iflush_int(
__func__, ip->i_ino, ip);
goto corrupt_out;
}
- } else if (S_ISDIR(ip->i_d.di_mode)) {
+ } else if (S_ISDIR(VFS_I(ip)->i_mode)) {
if (XFS_TEST_ERROR(
(ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
(ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
@@ -3517,12 +3496,11 @@ xfs_iflush_int(
ip->i_d.di_flushiter++;
/*
- * Copy the dirty parts of the inode into the on-disk
- * inode. We always copy out the core of the inode,
- * because if the inode is dirty at all the core must
- * be.
+ * Copy the dirty parts of the inode into the on-disk inode. We always
+ * copy out the core of the inode, because if the inode is dirty at all
+ * the core must be.
*/
- xfs_dinode_to_disk(dip, &ip->i_d);
+ xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn);
/* Wrap, we never let the log put out DI_MAX_FLUSH */
if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3560,6 +3538,7 @@ xfs_iflush_int(
*/
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
@@ -3573,10 +3552,6 @@ xfs_iflush_int(
*/
xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
- /* update the lsn in the on disk inode if required */
- if (ip->i_d.di_version == 3)
- dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
-
/* generate the checksum. */
xfs_dinode_calc_crc(mp, dip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ca9e11989cbd..43e1d51b15eb 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -63,7 +63,7 @@ typedef struct xfs_inode {
unsigned long i_flags; /* see defined flags below */
unsigned int i_delayed_blks; /* count of delay alloc blks */
- xfs_icdinode_t i_d; /* most of ondisk inode */
+ struct xfs_icdinode i_d; /* most of ondisk inode */
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
@@ -88,7 +88,7 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
*/
static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
{
- if (S_ISREG(ip->i_d.di_mode))
+ if (S_ISREG(VFS_I(ip)->i_mode))
return i_size_read(VFS_I(ip));
return ip->i_d.di_size;
}
@@ -369,7 +369,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
*/
#define XFS_INHERIT_GID(pip) \
(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
- ((pip)->i_d.di_mode & S_ISGID))
+ (VFS_I(pip)->i_mode & S_ISGID))
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
@@ -405,8 +405,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_bmap_free *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int, xfs_fsize_t);
-int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
-
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
@@ -437,6 +435,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
xfs_fsize_t isize, bool *did_zeroing);
int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
+loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
+ loff_t eof, int whence);
/* from xfs_iops.c */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 62bd80f4edd9..c48b5b18d771 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -135,7 +135,7 @@ xfs_inode_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_inode_log_format) +
- xfs_icdinode_size(ip->i_d.di_version);
+ xfs_log_dinode_size(ip->i_d.di_version);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
if (XFS_IFORK_Q(ip))
@@ -322,6 +322,81 @@ xfs_inode_item_format_attr_fork(
}
}
+static void
+xfs_inode_to_log_dinode(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to,
+ xfs_lsn_t lsn)
+{
+ struct xfs_icdinode *from = &ip->i_d;
+ struct inode *inode = VFS_I(ip);
+
+ to->di_magic = XFS_DINODE_MAGIC;
+
+ to->di_version = from->di_version;
+ to->di_format = from->di_format;
+ to->di_uid = from->di_uid;
+ to->di_gid = from->di_gid;
+ to->di_projid_lo = from->di_projid_lo;
+ to->di_projid_hi = from->di_projid_hi;
+
+ memset(to->di_pad, 0, sizeof(to->di_pad));
+ memset(to->di_pad3, 0, sizeof(to->di_pad3));
+ to->di_atime.t_sec = inode->i_atime.tv_sec;
+ to->di_atime.t_nsec = inode->i_atime.tv_nsec;
+ to->di_mtime.t_sec = inode->i_mtime.tv_sec;
+ to->di_mtime.t_nsec = inode->i_mtime.tv_nsec;
+ to->di_ctime.t_sec = inode->i_ctime.tv_sec;
+ to->di_ctime.t_nsec = inode->i_ctime.tv_nsec;
+ to->di_nlink = inode->i_nlink;
+ to->di_gen = inode->i_generation;
+ to->di_mode = inode->i_mode;
+
+ to->di_size = from->di_size;
+ to->di_nblocks = from->di_nblocks;
+ to->di_extsize = from->di_extsize;
+ to->di_nextents = from->di_nextents;
+ to->di_anextents = from->di_anextents;
+ to->di_forkoff = from->di_forkoff;
+ to->di_aformat = from->di_aformat;
+ to->di_dmevmask = from->di_dmevmask;
+ to->di_dmstate = from->di_dmstate;
+ to->di_flags = from->di_flags;
+
+ if (from->di_version == 3) {
+ to->di_changecount = inode->i_version;
+ to->di_crtime.t_sec = from->di_crtime.t_sec;
+ to->di_crtime.t_nsec = from->di_crtime.t_nsec;
+ to->di_flags2 = from->di_flags2;
+
+ to->di_ino = ip->i_ino;
+ to->di_lsn = lsn;
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
+ uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = from->di_flushiter;
+ }
+}
+
+/*
+ * Format the inode core. Current timestamp data is only in the VFS inode
+ * fields, so we need to grab them from there. Hence rather than just copying
+ * the XFS inode core structure, format the fields directly into the iovec.
+ */
+static void
+xfs_inode_item_format_core(
+ struct xfs_inode *ip,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
+{
+ struct xfs_log_dinode *dic;
+
+ dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
+ xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
+ xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+}
+
/*
* This is called to fill in the vector of log iovecs for the given inode
* log item. It fills the first item with an inode log format structure,
@@ -351,10 +426,7 @@ xfs_inode_item_format(
ilf->ilf_size = 2; /* format + core */
xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
- xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
- &ip->i_d,
- xfs_icdinode_size(ip->i_d.di_version));
-
+ xfs_inode_item_format_core(ip, lv, &vecp);
xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
if (XFS_IFORK_Q(ip)) {
xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
@@ -719,6 +791,7 @@ xfs_iflush_abort(
* attempted.
*/
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
}
/*
* Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 488d81254e28..4c7722e325b3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item {
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
+ unsigned int ili_fsync_fields; /* logged since last fsync */
} xfs_inode_log_item_t;
static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea7d85af5310..bcb6c19ce3ea 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_pnfs.h"
+#include "xfs_acl.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -113,7 +114,7 @@ xfs_find_handle(
handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
sizeof(handle.ha_fid.fid_len);
handle.ha_fid.fid_pad = 0;
- handle.ha_fid.fid_gen = ip->i_d.di_gen;
+ handle.ha_fid.fid_gen = inode->i_generation;
handle.ha_fid.fid_ino = ip->i_ino;
hsize = XFS_HSIZE(handle);
@@ -411,7 +412,7 @@ xfs_attrlist_by_handle(
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
@@ -455,7 +456,7 @@ xfs_attrmulti_attr_get(
unsigned char *kbuf;
int error = -EFAULT;
- if (*len > XATTR_SIZE_MAX)
+ if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = kmem_zalloc_large(*len, KM_SLEEP);
if (!kbuf)
@@ -482,17 +483,22 @@ xfs_attrmulti_attr_set(
__uint32_t flags)
{
unsigned char *kbuf;
+ int error;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XATTR_SIZE_MAX)
+ if (len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = memdup_user(ubuf, len);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ kfree(kbuf);
+ return error;
}
int
@@ -501,9 +507,14 @@ xfs_attrmulti_attr_remove(
unsigned char *name,
__uint32_t flags)
{
+ int error;
+
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- return xfs_attr_remove(XFS_I(inode), name, flags);
+ error = xfs_attr_remove(XFS_I(inode), name, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ return error;
}
STATIC int
@@ -848,25 +859,25 @@ xfs_merge_ioc_xflags(
unsigned int xflags = start;
if (flags & FS_IMMUTABLE_FL)
- xflags |= XFS_XFLAG_IMMUTABLE;
+ xflags |= FS_XFLAG_IMMUTABLE;
else
- xflags &= ~XFS_XFLAG_IMMUTABLE;
+ xflags &= ~FS_XFLAG_IMMUTABLE;
if (flags & FS_APPEND_FL)
- xflags |= XFS_XFLAG_APPEND;
+ xflags |= FS_XFLAG_APPEND;
else
- xflags &= ~XFS_XFLAG_APPEND;
+ xflags &= ~FS_XFLAG_APPEND;
if (flags & FS_SYNC_FL)
- xflags |= XFS_XFLAG_SYNC;
+ xflags |= FS_XFLAG_SYNC;
else
- xflags &= ~XFS_XFLAG_SYNC;
+ xflags &= ~FS_XFLAG_SYNC;
if (flags & FS_NOATIME_FL)
- xflags |= XFS_XFLAG_NOATIME;
+ xflags |= FS_XFLAG_NOATIME;
else
- xflags &= ~XFS_XFLAG_NOATIME;
+ xflags &= ~FS_XFLAG_NOATIME;
if (flags & FS_NODUMP_FL)
- xflags |= XFS_XFLAG_NODUMP;
+ xflags |= FS_XFLAG_NODUMP;
else
- xflags &= ~XFS_XFLAG_NODUMP;
+ xflags &= ~FS_XFLAG_NODUMP;
return xflags;
}
@@ -934,40 +945,51 @@ xfs_set_diflags(
unsigned int xflags)
{
unsigned int di_flags;
+ uint64_t di_flags2;
/* can't set PREALLOC this way, just preserve it */
di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
- if (xflags & XFS_XFLAG_IMMUTABLE)
+ if (xflags & FS_XFLAG_IMMUTABLE)
di_flags |= XFS_DIFLAG_IMMUTABLE;
- if (xflags & XFS_XFLAG_APPEND)
+ if (xflags & FS_XFLAG_APPEND)
di_flags |= XFS_DIFLAG_APPEND;
- if (xflags & XFS_XFLAG_SYNC)
+ if (xflags & FS_XFLAG_SYNC)
di_flags |= XFS_DIFLAG_SYNC;
- if (xflags & XFS_XFLAG_NOATIME)
+ if (xflags & FS_XFLAG_NOATIME)
di_flags |= XFS_DIFLAG_NOATIME;
- if (xflags & XFS_XFLAG_NODUMP)
+ if (xflags & FS_XFLAG_NODUMP)
di_flags |= XFS_DIFLAG_NODUMP;
- if (xflags & XFS_XFLAG_NODEFRAG)
+ if (xflags & FS_XFLAG_NODEFRAG)
di_flags |= XFS_DIFLAG_NODEFRAG;
- if (xflags & XFS_XFLAG_FILESTREAM)
+ if (xflags & FS_XFLAG_FILESTREAM)
di_flags |= XFS_DIFLAG_FILESTREAM;
- if (S_ISDIR(ip->i_d.di_mode)) {
- if (xflags & XFS_XFLAG_RTINHERIT)
+ if (S_ISDIR(VFS_I(ip)->i_mode)) {
+ if (xflags & FS_XFLAG_RTINHERIT)
di_flags |= XFS_DIFLAG_RTINHERIT;
- if (xflags & XFS_XFLAG_NOSYMLINKS)
+ if (xflags & FS_XFLAG_NOSYMLINKS)
di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if (xflags & XFS_XFLAG_EXTSZINHERIT)
+ if (xflags & FS_XFLAG_EXTSZINHERIT)
di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- if (xflags & XFS_XFLAG_PROJINHERIT)
+ if (xflags & FS_XFLAG_PROJINHERIT)
di_flags |= XFS_DIFLAG_PROJINHERIT;
- } else if (S_ISREG(ip->i_d.di_mode)) {
- if (xflags & XFS_XFLAG_REALTIME)
+ } else if (S_ISREG(VFS_I(ip)->i_mode)) {
+ if (xflags & FS_XFLAG_REALTIME)
di_flags |= XFS_DIFLAG_REALTIME;
- if (xflags & XFS_XFLAG_EXTSIZE)
+ if (xflags & FS_XFLAG_EXTSIZE)
di_flags |= XFS_DIFLAG_EXTSIZE;
}
-
ip->i_d.di_flags = di_flags;
+
+ /* diflags2 only valid for v3 inodes. */
+ if (ip->i_d.di_version < 3)
+ return;
+
+ di_flags2 = 0;
+ if (xflags & FS_XFLAG_DAX)
+ di_flags2 |= XFS_DIFLAG2_DAX;
+
+ ip->i_d.di_flags2 = di_flags2;
+
}
STATIC void
@@ -977,22 +999,27 @@ xfs_diflags_to_linux(
struct inode *inode = VFS_I(ip);
unsigned int xflags = xfs_ip2xflags(ip);
- if (xflags & XFS_XFLAG_IMMUTABLE)
+ if (xflags & FS_XFLAG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
else
inode->i_flags &= ~S_IMMUTABLE;
- if (xflags & XFS_XFLAG_APPEND)
+ if (xflags & FS_XFLAG_APPEND)
inode->i_flags |= S_APPEND;
else
inode->i_flags &= ~S_APPEND;
- if (xflags & XFS_XFLAG_SYNC)
+ if (xflags & FS_XFLAG_SYNC)
inode->i_flags |= S_SYNC;
else
inode->i_flags &= ~S_SYNC;
- if (xflags & XFS_XFLAG_NOATIME)
+ if (xflags & FS_XFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
else
inode->i_flags &= ~S_NOATIME;
+ if (xflags & FS_XFLAG_DAX)
+ inode->i_flags |= S_DAX;
+ else
+ inode->i_flags &= ~S_DAX;
+
}
static int
@@ -1005,11 +1032,11 @@ xfs_ioctl_setattr_xflags(
/* Can't change realtime flag if any extents are allocated. */
if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
- XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+ XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
return -EINVAL;
/* If realtime flag is set then must have realtime device */
- if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+ if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
(ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
return -EINVAL;
@@ -1020,7 +1047,7 @@ xfs_ioctl_setattr_xflags(
* we have appropriate permission.
*/
if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
- (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+ (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
@@ -1028,8 +1055,63 @@ xfs_ioctl_setattr_xflags(
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
+ return 0;
+}
+
+/*
+ * If we are changing DAX flags, we have to ensure the file is clean and any
+ * cached objects in the address space are invalidated and removed. This
+ * requires us to lock out other IO and page faults similar to a truncate
+ * operation. The locks need to be held until the transaction has been committed
+ * so that the cache invalidation is atomic with respect to the DAX flag
+ * manipulation.
+ */
+static int
+xfs_ioctl_setattr_dax_invalidate(
+ struct xfs_inode *ip,
+ struct fsxattr *fa,
+ int *join_flags)
+{
+ struct inode *inode = VFS_I(ip);
+ int error;
+
+ *join_flags = 0;
+
+ /*
+ * It is only valid to set the DAX flag on regular files and
+ * directories on filesystems where the block size is equal to the page
+ * size. On directories it serves as an inherit hint.
+ */
+ if (fa->fsx_xflags & FS_XFLAG_DAX) {
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EINVAL;
+ if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
+ return -EINVAL;
+ }
+
+ /* If the DAX state is not changing, we have nothing to do here. */
+ if ((fa->fsx_xflags & FS_XFLAG_DAX) && IS_DAX(inode))
+ return 0;
+ if (!(fa->fsx_xflags & FS_XFLAG_DAX) && !IS_DAX(inode))
+ return 0;
+
+ /* lock, flush and invalidate mapping in preparation for flag change */
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+ error = filemap_write_and_wait(inode->i_mapping);
+ if (error)
+ goto out_unlock;
+ error = invalidate_inode_pages2(inode->i_mapping);
+ if (error)
+ goto out_unlock;
+
+ *join_flags = XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL;
return 0;
+
+out_unlock:
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL | XFS_IOLOCK_EXCL);
+ return error;
+
}
/*
@@ -1037,19 +1119,27 @@ xfs_ioctl_setattr_xflags(
* have permission to do so. On success, return a clean transaction and the
* inode locked exclusively ready for further operation specific checks. On
* failure, return an error without modifying or locking the inode.
+ *
+ * The inode might already be IO locked on call. If this is the case, it is
+ * indicated in @join_flags and we take full responsibility for ensuring they
+ * are unlocked from now on. Hence if we have an error here, we still have to
+ * unlock them. Otherwise, once they are joined to the transaction, they will
+ * be unlocked on commit/cancel.
*/
static struct xfs_trans *
xfs_ioctl_setattr_get_trans(
- struct xfs_inode *ip)
+ struct xfs_inode *ip,
+ int join_flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
- int error;
+ int error = -EROFS;
if (mp->m_flags & XFS_MOUNT_RDONLY)
- return ERR_PTR(-EROFS);
+ goto out_unlock;
+ error = -EIO;
if (XFS_FORCED_SHUTDOWN(mp))
- return ERR_PTR(-EIO);
+ goto out_unlock;
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
@@ -1057,7 +1147,8 @@ xfs_ioctl_setattr_get_trans(
goto out_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
+ join_flags = 0;
/*
* CAP_FOWNER overrides the following restrictions:
@@ -1077,6 +1168,9 @@ xfs_ioctl_setattr_get_trans(
out_cancel:
xfs_trans_cancel(tp);
+out_unlock:
+ if (join_flags)
+ xfs_iunlock(ip, join_flags);
return ERR_PTR(error);
}
@@ -1084,8 +1178,8 @@ out_cancel:
* extent size hint validation is somewhat cumbersome. Rules are:
*
* 1. extent size hint is only valid for directories and regular files
- * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
- * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
* 4. can only be changed on regular files if no extents are allocated
* 5. can be changed on directories at any time
* 6. extsize hint of 0 turns off hints, clears inode flags.
@@ -1101,14 +1195,14 @@ xfs_ioctl_setattr_check_extsize(
{
struct xfs_mount *mp = ip->i_mount;
- if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
return -EINVAL;
- if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
- !S_ISDIR(ip->i_d.di_mode))
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+ !S_ISDIR(VFS_I(ip)->i_mode))
return -EINVAL;
- if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+ if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
return -EINVAL;
@@ -1121,7 +1215,7 @@ xfs_ioctl_setattr_check_extsize(
return -EINVAL;
if (XFS_IS_REALTIME_INODE(ip) ||
- (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+ (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
} else {
size = mp->m_sb.sb_blocksize;
@@ -1132,7 +1226,7 @@ xfs_ioctl_setattr_check_extsize(
if (fa->fsx_extsize % size)
return -EINVAL;
} else
- fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
return 0;
}
@@ -1157,7 +1251,7 @@ xfs_ioctl_setattr_check_projid(
if (xfs_get_projid(ip) != fa->fsx_projid)
return -EINVAL;
- if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+ if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=
(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
return -EINVAL;
@@ -1175,6 +1269,7 @@ xfs_ioctl_setattr(
struct xfs_dquot *pdqp = NULL;
struct xfs_dquot *olddquot = NULL;
int code;
+ int join_flags = 0;
trace_xfs_ioctl_setattr(ip);
@@ -1198,7 +1293,18 @@ xfs_ioctl_setattr(
return code;
}
- tp = xfs_ioctl_setattr_get_trans(ip);
+ /*
+ * Changing DAX config may require inode locking for mapping
+ * invalidation. These need to be held all the way to transaction commit
+ * or cancel time, so need to be passed through to
+ * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+ * appropriately.
+ */
+ code = xfs_ioctl_setattr_dax_invalidate(ip, fa, &join_flags);
+ if (code)
+ goto error_free_dquots;
+
+ tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
if (IS_ERR(tp)) {
code = PTR_ERR(tp);
goto error_free_dquots;
@@ -1229,9 +1335,9 @@ xfs_ioctl_setattr(
* successful return from chown()
*/
- if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) &&
!capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
- ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+ VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
if (xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1314,6 +1420,7 @@ xfs_ioc_setxflags(
struct xfs_trans *tp;
struct fsxattr fa;
unsigned int flags;
+ int join_flags = 0;
int error;
if (copy_from_user(&flags, arg, sizeof(flags)))
@@ -1330,7 +1437,18 @@ xfs_ioc_setxflags(
if (error)
return error;
- tp = xfs_ioctl_setattr_get_trans(ip);
+ /*
+ * Changing DAX config may require inode locking for mapping
+ * invalidation. These need to be held all the way to transaction commit
+ * or cancel time, so need to be passed through to
+ * xfs_ioctl_setattr_get_trans() so it can apply them to the join call
+ * appropriately.
+ */
+ error = xfs_ioctl_setattr_dax_invalidate(ip, &fa, &join_flags);
+ if (error)
+ goto out_drop_write;
+
+ tp = xfs_ioctl_setattr_get_trans(ip, join_flags);
if (IS_ERR(tp)) {
error = PTR_ERR(tp);
goto out_drop_write;
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index b88bdc85dd3d..1a05d8ae327d 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle(
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033171c8..d81bdc080370 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -129,22 +129,31 @@ xfs_iomap_write_direct(
xfs_trans_t *tp;
xfs_bmap_free_t free_list;
uint qblocks, resblks, resrtextents;
- int committed;
int error;
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
+ int lockmode;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
+ lockmode = XFS_ILOCK_SHARED; /* locked by caller */
+
+ ASSERT(xfs_isilocked(ip, lockmode));
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
if ((offset + count) > XFS_ISIZE(ip)) {
+ /*
+ * Assert that the in-core extent list is present since this can
+ * call xfs_iread_extents() and we only have the ilock shared.
+ * This should be safe because the lock was held around a bmapi
+ * call in the caller and we only need it to access the in-core
+ * list.
+ */
+ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
+ XFS_IFEXTENTS);
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- return error;
+ goto out_unlock;
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -174,9 +183,40 @@ xfs_iomap_write_direct(
}
/*
+ * Drop the shared lock acquired by the caller, attach the dquot if
+ * necessary and move on to transaction setup.
+ */
+ xfs_iunlock(ip, lockmode);
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
* Allocate and setup the transaction
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+ /*
+ * For DAX, we do not allocate unwritten extents, but instead we zero
+ * the block before we commit the transaction. Ideally we'd like to do
+ * this outside the transaction context, but if we commit and then crash
+ * we may not have zeroed the blocks and this will be exposed on
+ * recovery of the allocation. Hence we must zero before commit.
+ *
+ * Further, if we are mapping unwritten extents here, we need to zero
+ * and convert them to written so that we don't need an unwritten extent
+ * callback for DAX. This also means that we need to be able to dip into
+ * the reserve block pool for bmbt block allocation if there is no space
+ * left but we need to do unwritten extent conversion.
+ */
+
+ if (IS_DAX(VFS_I(ip))) {
+ bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+ if (ISUNWRITTEN(imap)) {
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ }
+ }
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, resrtextents);
/*
@@ -187,7 +227,8 @@ xfs_iomap_write_direct(
return error;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
@@ -202,17 +243,18 @@ xfs_iomap_write_direct(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_PREALLOC, &firstfsb, 0,
- imap, &nimaps, &free_list);
+ bmapi_flags, &firstfsb, resblks, imap,
+ &nimaps, &free_list);
if (error)
goto out_bmap_cancel;
/*
* Complete the transaction
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
+
error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
@@ -229,7 +271,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
out_bmap_cancel:
@@ -655,7 +697,7 @@ xfs_iomap_write_allocate(
xfs_bmap_free_t free_list;
xfs_filblks_t count_fsb;
xfs_trans_t *tp;
- int nimaps, committed;
+ int nimaps;
int error = 0;
int nres;
@@ -670,7 +712,7 @@ xfs_iomap_write_allocate(
count_fsb = imap->br_blockcount;
map_start_fsb = imap->br_startoff;
- XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
while (count_fsb != 0) {
/*
@@ -750,13 +792,13 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0,
- &first_block, 1,
- imap, &nimaps, &free_list);
+ count_fsb, 0, &first_block,
+ nres, imap, &nimaps,
+ &free_list);
if (error)
goto trans_cancel;
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto trans_cancel;
@@ -777,7 +819,7 @@ xfs_iomap_write_allocate(
if ((offset_fsb >= imap->br_startoff) &&
(offset_fsb < (imap->br_startoff +
imap->br_blockcount))) {
- XFS_STATS_INC(xs_xstrat_quick);
+ XFS_STATS_INC(mp, xs_xstrat_quick);
return 0;
}
@@ -814,7 +856,6 @@ xfs_iomap_write_unwritten(
xfs_bmap_free_t free_list;
xfs_fsize_t i_size;
uint resblks;
- int committed;
int error;
trace_xfs_unwritten_convert(ip, offset, count);
@@ -866,8 +907,8 @@ xfs_iomap_write_unwritten(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_CONVERT, &firstfsb,
- 1, &imap, &nimaps, &free_list);
+ XFS_BMAPI_CONVERT, &firstfsb, resblks,
+ &imap, &nimaps, &free_list);
if (error)
goto error_on_bmapi_transaction;
@@ -886,7 +927,7 @@ xfs_iomap_write_unwritten(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8294132e6a3c..fb7dc61f4a29 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -414,13 +414,17 @@ xfs_vn_rename(
* uio is kmalloced for this reason...
*/
STATIC const char *
-xfs_vn_follow_link(
+xfs_vn_get_link(
struct dentry *dentry,
- void **cookie)
+ struct inode *inode,
+ struct delayed_call *done)
{
char *link;
int error = -ENOMEM;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
if (!link)
goto out_err;
@@ -429,7 +433,8 @@ xfs_vn_follow_link(
if (unlikely(error))
goto out_kfree;
- return *cookie = link;
+ set_delayed_call(done, kfree_link, link);
+ return link;
out_kfree:
kfree(link);
@@ -454,8 +459,8 @@ xfs_vn_getattr(
stat->size = XFS_ISIZE(ip);
stat->dev = inode->i_sb->s_dev;
- stat->mode = ip->i_d.di_mode;
- stat->nlink = ip->i_d.di_nlink;
+ stat->mode = inode->i_mode;
+ stat->nlink = inode->i_nlink;
stat->uid = inode->i_uid;
stat->gid = inode->i_gid;
stat->ino = ip->i_ino;
@@ -501,9 +506,6 @@ xfs_setattr_mode(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ip->i_d.di_mode &= S_IFMT;
- ip->i_d.di_mode |= mode & ~S_IFMT;
-
inode->i_mode &= S_IFMT;
inode->i_mode |= mode & ~S_IFMT;
}
@@ -517,21 +519,12 @@ xfs_setattr_time(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (iattr->ia_valid & ATTR_ATIME) {
+ if (iattr->ia_valid & ATTR_ATIME)
inode->i_atime = iattr->ia_atime;
- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- }
- if (iattr->ia_valid & ATTR_CTIME) {
+ if (iattr->ia_valid & ATTR_CTIME)
inode->i_ctime = iattr->ia_ctime;
- ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- }
- if (iattr->ia_valid & ATTR_MTIME) {
+ if (iattr->ia_valid & ATTR_MTIME)
inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- }
}
int
@@ -656,9 +649,9 @@ xfs_setattr_nonsize(
* The set-user-ID and set-group-ID bits of a file will be
* cleared upon successful return from chown()
*/
- if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+ if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
!capable(CAP_FSETID))
- ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+ inode->i_mode &= ~(S_ISUID|S_ISGID);
/*
* Change the ownerships and register quota modifications
@@ -695,7 +688,7 @@ xfs_setattr_nonsize(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -768,7 +761,7 @@ xfs_setattr_size(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
- ASSERT(S_ISREG(ip->i_d.di_mode));
+ ASSERT(S_ISREG(inode->i_mode));
ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -922,7 +915,7 @@ xfs_setattr_size(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -986,21 +979,13 @@ xfs_vn_update_time(
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (flags & S_CTIME) {
+ if (flags & S_CTIME)
inode->i_ctime = *now;
- ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec;
- }
- if (flags & S_MTIME) {
+ if (flags & S_MTIME)
inode->i_mtime = *now;
- ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec;
- }
- if (flags & S_ATIME) {
+ if (flags & S_ATIME)
inode->i_atime = *now;
- ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec;
- }
+
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
return xfs_trans_commit(tp);
@@ -1172,8 +1157,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = xfs_vn_follow_link,
- .put_link = kfree_put_link,
+ .get_link = xfs_vn_get_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1201,8 +1185,10 @@ xfs_diflags_to_iflags(
inode->i_flags |= S_SYNC;
if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- /* XXX: Also needs an on-disk per inode flag! */
- if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+ if (S_ISREG(inode->i_mode) &&
+ ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
+ (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
+ ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
inode->i_flags |= S_DAX;
}
@@ -1228,8 +1214,6 @@ xfs_setup_inode(
/* make the inode look hashed for the writeback code */
hlist_add_fake(&inode->i_hash);
- inode->i_mode = ip->i_d.di_mode;
- set_nlink(inode, ip->i_d.di_nlink);
inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
@@ -1245,14 +1229,7 @@ xfs_setup_inode(
break;
}
- inode->i_generation = ip->i_d.di_gen;
i_size_write(inode, ip->i_d.di_size);
- inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
- inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
- inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
xfs_diflags_to_iflags(inode, ip);
ip->d_ops = ip->i_mount->m_nondir_inode_ops;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 930ebd86beba..ce73eb34620d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -57,6 +57,7 @@ xfs_bulkstat_one_int(
{
struct xfs_icdinode *dic; /* dinode core info pointer */
struct xfs_inode *ip; /* incore inode pointer */
+ struct inode *inode;
struct xfs_bstat *buf; /* return buffer */
int error = 0; /* error value */
@@ -77,30 +78,33 @@ xfs_bulkstat_one_int(
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
+ inode = VFS_I(ip);
dic = &ip->i_d;
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_nlink = dic->di_nlink;
buf->bs_projid_lo = dic->di_projid_lo;
buf->bs_projid_hi = dic->di_projid_hi;
buf->bs_ino = ino;
- buf->bs_mode = dic->di_mode;
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
- buf->bs_atime.tv_sec = dic->di_atime.t_sec;
- buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
- buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
- buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
- buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
- buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+
+ buf->bs_nlink = inode->i_nlink;
+ buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+ buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+ buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+ buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_gen = inode->i_generation;
+ buf->bs_mode = inode->i_mode;
+
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
buf->bs_extents = dic->di_nextents;
- buf->bs_gen = dic->di_gen;
memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
buf->bs_dmevmask = dic->di_dmevmask;
buf->bs_dmstate = dic->di_dmstate;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 85f883dd6207..a8192dc797dc 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -135,7 +135,7 @@ typedef __u32 xfs_nlink_t;
* Size of block device i/o is parameterized here.
* Currently the system supports page-sized i/o.
*/
-#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT
+#define BLKDEV_IOSHIFT PAGE_SHIFT
#define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
/* number of BB's per block device block */
#define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
@@ -171,6 +171,13 @@ struct xfs_kobj {
struct completion complete;
};
+struct xstats {
+ struct xfsstats __percpu *xs_stats;
+ struct xfs_kobj xs_kobj;
+};
+
+extern struct xstats xfsstats;
+
/* Kernel uid/gid conversion. These are used to convert to/from the on disk
* uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
* The conversion here is type only, the value will remain the same since we
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index aaadee0969c9..b49ccf5c1d75 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -268,7 +268,7 @@ xlog_grant_head_wait(
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&head->lock);
- XFS_STATS_INC(xs_sleep_logspace);
+ XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
trace_xfs_log_grant_sleep(log, tic);
schedule();
@@ -379,7 +379,7 @@ xfs_log_regrant(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
/*
* This is a new transaction on the ticket, so we need to change the
@@ -448,7 +448,7 @@ xfs_log_reserve(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
@@ -1188,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp)
int aborted = 0;
/*
- * Race to shutdown the filesystem if we see an error.
+ * Race to shutdown the filesystem if we see an error or the iclog is in
+ * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
+ * CRC errors into log recovery.
*/
- if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
- XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR,
+ XFS_RANDOM_IODONE_IOERR) ||
+ iclog->ic_state & XLOG_STATE_IOABORT) {
+ if (iclog->ic_state & XLOG_STATE_IOABORT)
+ iclog->ic_state &= ~XLOG_STATE_IOABORT;
+
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_stale(bp);
xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
@@ -1206,7 +1212,7 @@ xlog_iodone(xfs_buf_t *bp)
}
/* log I/O is always issued ASYNC */
- ASSERT(XFS_BUF_ISASYNC(bp));
+ ASSERT(bp->b_flags & XBF_ASYNC);
xlog_state_done_syncing(iclog, aborted);
/*
@@ -1768,7 +1774,7 @@ xlog_sync(
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
int size;
- XFS_STATS_INC(xs_log_writes);
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/* Add for LR header */
@@ -1805,7 +1811,7 @@ xlog_sync(
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
- XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+ XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
@@ -1838,12 +1844,28 @@ xlog_sync(
/* calculcate the checksum */
iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
iclog->ic_datap, size);
+#ifdef DEBUG
+ /*
+ * Intentionally corrupt the log record CRC based on the error injection
+ * frequency, if defined. This facilitates testing log recovery in the
+ * event of torn writes. Hence, set the IOABORT state to abort the log
+ * write on I/O completion and shutdown the fs. The subsequent mount
+ * detects the bad CRC and attempts to recover.
+ */
+ if (log->l_badcrc_factor &&
+ (prandom_u32() % log->l_badcrc_factor == 0)) {
+ iclog->ic_header.h_crc &= 0xAAAAAAAA;
+ iclog->ic_state |= XLOG_STATE_IOABORT;
+ xfs_warn(log->l_mp,
+ "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
+ be64_to_cpu(iclog->ic_header.h_lsn));
+ }
+#endif
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
- XFS_BUF_ZEROFLAGS(bp);
- XFS_BUF_ASYNC(bp);
- bp->b_flags |= XBF_SYNCIO;
+ bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
bp->b_flags |= XBF_FUA;
@@ -1870,12 +1892,11 @@ xlog_sync(
/* account for log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+
/*
* Don't call xfs_bwrite here. We do log-syncs even when the filesystem
* is shutting down.
*/
- XFS_BUF_WRITE(bp);
-
error = xlog_bdstrat(bp);
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_sync");
@@ -1887,9 +1908,8 @@ xlog_sync(
xfs_buf_associate_memory(bp,
(char *)&iclog->ic_header + count, split);
bp->b_fspriv = iclog;
- XFS_BUF_ZEROFLAGS(bp);
- XFS_BUF_ASYNC(bp);
- bp->b_flags |= XBF_SYNCIO;
+ bp->b_flags &= ~(XBF_FUA | XBF_FLUSH);
+ bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE);
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
bp->b_flags |= XBF_FUA;
@@ -1898,7 +1918,6 @@ xlog_sync(
/* account for internal log which doesn't start at block #0 */
XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
- XFS_BUF_WRITE(bp);
error = xlog_bdstrat(bp);
if (error) {
xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
@@ -1989,75 +2008,81 @@ xlog_print_tic_res(
uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
/* match with XLOG_REG_TYPE_* in xfs_log.h */
- static char *res_type_str[XLOG_REG_TYPE_MAX] = {
- "bformat",
- "bchunk",
- "efi_format",
- "efd_format",
- "iformat",
- "icore",
- "iext",
- "ibroot",
- "ilocal",
- "iattr_ext",
- "iattr_broot",
- "iattr_local",
- "qformat",
- "dquot",
- "quotaoff",
- "LR header",
- "unmount",
- "commit",
- "trans header"
+#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
+ static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+ REG_TYPE_STR(BFORMAT, "bformat"),
+ REG_TYPE_STR(BCHUNK, "bchunk"),
+ REG_TYPE_STR(EFI_FORMAT, "efi_format"),
+ REG_TYPE_STR(EFD_FORMAT, "efd_format"),
+ REG_TYPE_STR(IFORMAT, "iformat"),
+ REG_TYPE_STR(ICORE, "icore"),
+ REG_TYPE_STR(IEXT, "iext"),
+ REG_TYPE_STR(IBROOT, "ibroot"),
+ REG_TYPE_STR(ILOCAL, "ilocal"),
+ REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
+ REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
+ REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
+ REG_TYPE_STR(QFORMAT, "qformat"),
+ REG_TYPE_STR(DQUOT, "dquot"),
+ REG_TYPE_STR(QUOTAOFF, "quotaoff"),
+ REG_TYPE_STR(LRHEADER, "LR header"),
+ REG_TYPE_STR(UNMOUNT, "unmount"),
+ REG_TYPE_STR(COMMIT, "commit"),
+ REG_TYPE_STR(TRANSHDR, "trans header"),
+ REG_TYPE_STR(ICREATE, "inode create")
};
+#undef REG_TYPE_STR
+#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type
static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
- "SETATTR_NOT_SIZE",
- "SETATTR_SIZE",
- "INACTIVE",
- "CREATE",
- "CREATE_TRUNC",
- "TRUNCATE_FILE",
- "REMOVE",
- "LINK",
- "RENAME",
- "MKDIR",
- "RMDIR",
- "SYMLINK",
- "SET_DMATTRS",
- "GROWFS",
- "STRAT_WRITE",
- "DIOSTRAT",
- "WRITE_SYNC",
- "WRITEID",
- "ADDAFORK",
- "ATTRINVAL",
- "ATRUNCATE",
- "ATTR_SET",
- "ATTR_RM",
- "ATTR_FLAG",
- "CLEAR_AGI_BUCKET",
- "QM_SBCHANGE",
- "DUMMY1",
- "DUMMY2",
- "QM_QUOTAOFF",
- "QM_DQALLOC",
- "QM_SETQLIM",
- "QM_DQCLUSTER",
- "QM_QINOCREATE",
- "QM_QUOTAOFF_END",
- "SB_UNIT",
- "FSYNC_TS",
- "GROWFSRT_ALLOC",
- "GROWFSRT_ZERO",
- "GROWFSRT_FREE",
- "SWAPEXT"
+ TRANS_TYPE_STR(SETATTR_NOT_SIZE),
+ TRANS_TYPE_STR(SETATTR_SIZE),
+ TRANS_TYPE_STR(INACTIVE),
+ TRANS_TYPE_STR(CREATE),
+ TRANS_TYPE_STR(CREATE_TRUNC),
+ TRANS_TYPE_STR(TRUNCATE_FILE),
+ TRANS_TYPE_STR(REMOVE),
+ TRANS_TYPE_STR(LINK),
+ TRANS_TYPE_STR(RENAME),
+ TRANS_TYPE_STR(MKDIR),
+ TRANS_TYPE_STR(RMDIR),
+ TRANS_TYPE_STR(SYMLINK),
+ TRANS_TYPE_STR(SET_DMATTRS),
+ TRANS_TYPE_STR(GROWFS),
+ TRANS_TYPE_STR(STRAT_WRITE),
+ TRANS_TYPE_STR(DIOSTRAT),
+ TRANS_TYPE_STR(WRITEID),
+ TRANS_TYPE_STR(ADDAFORK),
+ TRANS_TYPE_STR(ATTRINVAL),
+ TRANS_TYPE_STR(ATRUNCATE),
+ TRANS_TYPE_STR(ATTR_SET),
+ TRANS_TYPE_STR(ATTR_RM),
+ TRANS_TYPE_STR(ATTR_FLAG),
+ TRANS_TYPE_STR(CLEAR_AGI_BUCKET),
+ TRANS_TYPE_STR(SB_CHANGE),
+ TRANS_TYPE_STR(DUMMY1),
+ TRANS_TYPE_STR(DUMMY2),
+ TRANS_TYPE_STR(QM_QUOTAOFF),
+ TRANS_TYPE_STR(QM_DQALLOC),
+ TRANS_TYPE_STR(QM_SETQLIM),
+ TRANS_TYPE_STR(QM_DQCLUSTER),
+ TRANS_TYPE_STR(QM_QINOCREATE),
+ TRANS_TYPE_STR(QM_QUOTAOFF_END),
+ TRANS_TYPE_STR(FSYNC_TS),
+ TRANS_TYPE_STR(GROWFSRT_ALLOC),
+ TRANS_TYPE_STR(GROWFSRT_ZERO),
+ TRANS_TYPE_STR(GROWFSRT_FREE),
+ TRANS_TYPE_STR(SWAPEXT),
+ TRANS_TYPE_STR(CHECKPOINT),
+ TRANS_TYPE_STR(ICREATE),
+ TRANS_TYPE_STR(CREATE_TMPFILE)
};
+#undef TRANS_TYPE_STR
xfs_warn(mp, "xlog_write: reservation summary:");
xfs_warn(mp, " trans type = %s (%u)",
((ticket->t_trans_type <= 0 ||
ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
- "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+ "bad-trans-type" : trans_type_str[ticket->t_trans_type]),
ticket->t_trans_type);
xfs_warn(mp, " unit res = %d bytes",
ticket->t_unit_res);
@@ -2076,7 +2101,7 @@ xlog_print_tic_res(
uint r_type = ticket->t_res_arr[i].r_type;
xfs_warn(mp, "region[%u]: %s - %u bytes", i,
((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type-1]),
+ "bad-rtype" : res_type_str[r_type]),
ticket->t_res_arr[i].r_len);
}
@@ -2422,11 +2447,20 @@ xlog_write(
&partial_copy_len);
xlog_verify_dest_ptr(log, ptr);
- /* copy region */
+ /*
+ * Copy region.
+ *
+ * Unmount records just log an opheader, so can have
+ * empty payloads with no data region to copy. Hence we
+ * only copy the payload if the vector says it has data
+ * to copy.
+ */
ASSERT(copy_len >= 0);
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
-
+ if (copy_len > 0) {
+ memcpy(ptr, reg->i_addr + copy_off, copy_len);
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ copy_len);
+ }
copy_len += start_rec_copy + sizeof(xlog_op_header_t);
record_cnt++;
data_cnt += contwr ? copy_len : 0;
@@ -2782,11 +2816,19 @@ xlog_state_do_callback(
}
} while (!ioerrors && loopdidcallbacks);
+#ifdef DEBUG
/*
- * make one last gasp attempt to see if iclogs are being left in
- * limbo..
+ * Make one last gasp attempt to see if iclogs are being left in limbo.
+ * If the above loop finds an iclog earlier than the current iclog and
+ * in one of the syncing states, the current iclog is put into
+ * DO_CALLBACK and the callbacks are deferred to the completion of the
+ * earlier iclog. Walk the iclogs in order and make sure that no iclog
+ * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
+ * states.
+ *
+ * Note that SYNCING|IOABORT is a valid state so we cannot just check
+ * for ic_state == SYNCING.
*/
-#ifdef DEBUG
if (funcdidcallbacks) {
first_iclog = iclog = log->l_iclog;
do {
@@ -2801,7 +2843,7 @@ xlog_state_do_callback(
* IOERROR - give up hope all ye who enter here
*/
if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- iclog->ic_state == XLOG_STATE_SYNCING ||
+ iclog->ic_state & XLOG_STATE_SYNCING ||
iclog->ic_state == XLOG_STATE_DONE_SYNC ||
iclog->ic_state == XLOG_STATE_IOERROR )
break;
@@ -2913,7 +2955,7 @@ restart:
iclog = log->l_iclog;
if (iclog->ic_state != XLOG_STATE_ACTIVE) {
- XFS_STATS_INC(xs_log_noiclogs);
+ XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
/* Wait for log writes to have flushed */
xlog_wait(&log->l_flush_wait, &log->l_icloglock);
@@ -3165,11 +3207,19 @@ xlog_state_switch_iclogs(
}
if (log->l_curr_block >= log->l_logBBsize) {
+ /*
+ * Rewind the current block before the cycle is bumped to make
+ * sure that the combined LSN never transiently moves forward
+ * when the log wraps to the next cycle. This is to support the
+ * unlocked sample of these fields from xlog_valid_lsn(). Most
+ * other cases should acquire l_icloglock.
+ */
+ log->l_curr_block -= log->l_logBBsize;
+ ASSERT(log->l_curr_block >= 0);
+ smp_wmb();
log->l_curr_cycle++;
if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
log->l_curr_cycle++;
- log->l_curr_block -= log->l_logBBsize;
- ASSERT(log->l_curr_block >= 0);
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
@@ -3212,7 +3262,7 @@ _xfs_log_force(
struct xlog_in_core *iclog;
xfs_lsn_t lsn;
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
xlog_cil_force(log);
@@ -3297,7 +3347,7 @@ maybe_sleep:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -3362,7 +3412,7 @@ _xfs_log_force_lsn(
ASSERT(lsn != 0);
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
lsn = xlog_cil_force_lsn(log, lsn);
if (lsn == NULLCOMMITLSN)
@@ -3411,7 +3461,7 @@ try_again:
(XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
@@ -3441,7 +3491,7 @@ try_again:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -3929,7 +3979,7 @@ xfs_log_force_umount(
log->l_flags & XLOG_ACTIVE_RECOVERY) {
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
if (mp->m_sb_bp)
- XFS_BUF_DONE(mp->m_sb_bp);
+ mp->m_sb_bp->b_flags |= XBF_DONE;
return 0;
}
@@ -3959,7 +4009,7 @@ xfs_log_force_umount(
spin_lock(&log->l_icloglock);
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
if (mp->m_sb_bp)
- XFS_BUF_DONE(mp->m_sb_bp);
+ mp->m_sb_bp->b_flags |= XBF_DONE;
/*
* Mark the log and the iclogs with IO error flags to prevent any
@@ -4023,3 +4073,45 @@ xlog_iclogs_empty(
return 1;
}
+/*
+ * Verify that an LSN stamped into a piece of metadata is valid. This is
+ * intended for use in read verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn)
+{
+ struct xlog *log = mp->m_log;
+ bool valid;
+
+ /*
+ * norecovery mode skips mount-time log processing and unconditionally
+ * resets the in-core LSN. We can't validate in this mode, but
+ * modifications are not allowed anyway so just return true.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return true;
+
+ /*
+ * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+ * handled by recovery and thus safe to ignore here.
+ */
+ if (lsn == NULLCOMMITLSN)
+ return true;
+
+ valid = xlog_valid_lsn(mp->m_log, lsn);
+
+ /* warn the user about what's gone wrong before verifier failure */
+ if (!valid) {
+ spin_lock(&log->l_icloglock);
+ xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+ CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+ log->l_curr_cycle, log->l_curr_block);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ return valid;
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 09d91d3166cd..aa533a7d50f2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,6 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
+bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 950f3f94720c..ed8896310c00 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i)
#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */
#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
@@ -410,6 +411,8 @@ struct xlog {
/* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG
void *l_iclog_bak[XLOG_MAX_ICLOGS];
+ /* log record crc error injection factor */
+ uint32_t l_badcrc_factor;
#endif
};
@@ -560,4 +563,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
remove_wait_queue(wq, &wait);
}
+/*
+ * The LSN is valid so long as it is behind the current LSN. If it isn't, this
+ * means that the next log record that includes this metadata could have a
+ * smaller LSN. In turn, this means that the modification in the log would not
+ * replay.
+ */
+static inline bool
+xlog_valid_lsn(
+ struct xlog *log,
+ xfs_lsn_t lsn)
+{
+ int cur_cycle;
+ int cur_block;
+ bool valid = true;
+
+ /*
+ * First, sample the current lsn without locking to avoid added
+ * contention from metadata I/O. The current cycle and block are updated
+ * (in xlog_state_switch_iclogs()) and read here in a particular order
+ * to avoid false negatives (e.g., thinking the metadata LSN is valid
+ * when it is not).
+ *
+ * The current block is always rewound before the cycle is bumped in
+ * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+ * a transiently forward state. Instead, we can see the LSN in a
+ * transiently behind state if we happen to race with a cycle wrap.
+ */
+ cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+ smp_rmb();
+ cur_block = ACCESS_ONCE(log->l_curr_block);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+ /*
+ * If the metadata LSN appears invalid, it's possible the check
+ * above raced with a wrap to the next log cycle. Grab the lock
+ * to check for sure.
+ */
+ spin_lock(&log->l_icloglock);
+ cur_cycle = log->l_curr_cycle;
+ cur_block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+ valid = false;
+ }
+
+ return valid;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 512a0945d52a..396565f43247 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -61,6 +61,9 @@ xlog_recover_check_summary(
#else
#define xlog_recover_check_summary(log)
#endif
+STATIC int
+xlog_do_recovery_pass(
+ struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
/*
* This structure is used during recovery to record the buf log items which
@@ -187,7 +190,7 @@ xlog_bread_noalign(
ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- XFS_BUF_READ(bp);
+ bp->b_flags |= XBF_READ;
bp->b_io_length = nbblks;
bp->b_error = 0;
@@ -272,7 +275,6 @@ xlog_bwrite(
ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- XFS_BUF_ZEROFLAGS(bp);
xfs_buf_hold(bp);
xfs_buf_lock(bp);
bp->b_io_length = nbblks;
@@ -868,136 +870,365 @@ validate_head:
}
/*
- * Find the sync block number or the tail of the log.
- *
- * This will be the block number of the last record to have its
- * associated buffers synced to disk. Every log record header has
- * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
- * to get a sync block number. The only concern is to figure out which
- * log record header to believe.
- *
- * The following algorithm uses the log record header with the largest
- * lsn. The entire log record does not need to be valid. We only care
- * that the header is valid.
+ * Seek backwards in the log for log record headers.
*
- * We could speed up search by using current head_blk buffer, but it is not
- * available.
+ * Given a starting log block, walk backwards until we find the provided number
+ * of records or hit the provided tail block. The return value is the number of
+ * records encountered or a negative error code. The log block and buffer
+ * pointer of the last record seen are returned in rblk and rhead respectively.
*/
STATIC int
-xlog_find_tail(
+xlog_rseek_logrec_hdr(
struct xlog *log,
- xfs_daddr_t *head_blk,
- xfs_daddr_t *tail_blk)
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ struct xfs_buf *bp,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
{
- xlog_rec_header_t *rhead;
- xlog_op_header_t *op_head;
+ int i;
+ int error;
+ int found = 0;
char *offset = NULL;
- xfs_buf_t *bp;
- int error, i, found;
- xfs_daddr_t umount_data_blk;
- xfs_daddr_t after_umount_blk;
- xfs_lsn_t tail_lsn;
- int hblks;
+ xfs_daddr_t end_blk;
- found = 0;
+ *wrapped = false;
/*
- * Find previous log record
+ * Walk backwards from the head block until we hit the tail or the first
+ * block in the log.
*/
- if ((error = xlog_find_head(log, head_blk)))
- return error;
-
- bp = xlog_get_bp(log, 1);
- if (!bp)
- return -ENOMEM;
- if (*head_blk == 0) { /* special case */
- error = xlog_bread(log, 0, 1, bp, &offset);
+ end_blk = head_blk > tail_blk ? tail_blk : 0;
+ for (i = (int) head_blk - 1; i >= end_blk; i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
if (error)
- goto done;
+ goto out_error;
- if (xlog_get_cycle(offset) == 0) {
- *tail_blk = 0;
- /* leave all other log inited values alone */
- goto done;
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
}
}
/*
- * Search backwards looking for log record header block
+ * If we haven't hit the tail block or the log record header count,
+ * start looking again from the end of the physical log. Note that
+ * callers can pass head == tail if the tail is not yet known.
*/
- ASSERT(*head_blk < INT_MAX);
- for (i = (int)(*head_blk) - 1; i >= 0; i--) {
+ if (tail_blk >= head_blk && found != count) {
+ for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
+ error = xlog_bread(log, i, 1, bp, &offset);
+ if (error)
+ goto out_error;
+
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
+ }
+ }
+ }
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is the
+ * number of records encountered or a negative error code. The log block and
+ * buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk,
+ int count,
+ struct xfs_buf *bp,
+ xfs_daddr_t *rblk,
+ struct xlog_rec_header **rhead,
+ bool *wrapped)
+{
+ int i;
+ int error;
+ int found = 0;
+ char *offset = NULL;
+ xfs_daddr_t end_blk;
+
+ *wrapped = false;
+
+ /*
+ * Walk forward from the tail block until we hit the head or the last
+ * block in the log.
+ */
+ end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+ for (i = (int) tail_blk; i <= end_blk; i++) {
error = xlog_bread(log, i, 1, bp, &offset);
if (error)
- goto done;
+ goto out_error;
- if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
- found = 1;
- break;
+ if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
}
}
+
/*
- * If we haven't found the log record header block, start looking
- * again from the end of the physical log. XXXmiken: There should be
- * a check here to make sure we didn't search more than N blocks in
- * the previous code.
+ * If we haven't hit the head block or the log record header count,
+ * start looking again from the start of the physical log.
*/
- if (!found) {
- for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
+ if (tail_blk > head_blk && found != count) {
+ for (i = 0; i < (int) head_blk; i++) {
error = xlog_bread(log, i, 1, bp, &offset);
if (error)
- goto done;
+ goto out_error;
if (*(__be32 *)offset ==
cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
- found = 2;
- break;
+ *wrapped = true;
+ *rblk = i;
+ *rhead = (struct xlog_rec_header *) offset;
+ if (++found == count)
+ break;
}
}
}
- if (!found) {
- xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- xlog_put_bp(bp);
- ASSERT(0);
- return -EIO;
+
+ return found;
+
+out_error:
+ return error;
+}
+
+/*
+ * Check the log tail for torn writes. This is required when torn writes are
+ * detected at the head and the head had to be walked back to a previous record.
+ * The tail of the previous record must now be verified to ensure the torn
+ * writes didn't corrupt the previous tail.
+ *
+ * Return an error if CRC verification fails as recovery cannot proceed.
+ */
+STATIC int
+xlog_verify_tail(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ xfs_daddr_t tail_blk)
+{
+ struct xlog_rec_header *thead;
+ struct xfs_buf *bp;
+ xfs_daddr_t first_bad;
+ int count;
+ int error = 0;
+ bool wrapped;
+ xfs_daddr_t tmp_head;
+
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return -ENOMEM;
+
+ /*
+ * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+ * a temporary head block that points after the last possible
+ * concurrently written record of the tail.
+ */
+ count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+ XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+ &wrapped);
+ if (count < 0) {
+ error = count;
+ goto out;
}
- /* find blk_no of tail of log */
- rhead = (xlog_rec_header_t *)offset;
- *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+ /*
+ * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+ * into the actual log head. tmp_head points to the start of the record
+ * so update it to the actual head block.
+ */
+ if (count < XLOG_MAX_ICLOGS + 1)
+ tmp_head = head_blk;
/*
- * Reset log values according to the state of the log when we
- * crashed. In the case where head_blk == 0, we bump curr_cycle
- * one because the next write starts a new cycle rather than
- * continuing the cycle of the last good log record. At this
- * point we have guaranteed that all partial log records have been
- * accounted for. Therefore, we know that the last good log record
- * written was complete and ended exactly on the end boundary
- * of the physical log.
+ * We now have a tail and temporary head block that covers at least
+ * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+ * records were completely written. Run a CRC verification pass from
+ * tail to head and return the result.
*/
- log->l_prev_block = i;
- log->l_curr_block = (int)*head_blk;
- log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
- if (found == 2)
- log->l_curr_cycle++;
- atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
- atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
- xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
- xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
- BBTOB(log->l_curr_block));
+ error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+
+out:
+ xlog_put_bp(bp);
+ return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last good
+ * record in the log and the tail is updated from that record and verified.
+ */
+STATIC int
+xlog_verify_head(
+ struct xlog *log,
+ xfs_daddr_t *head_blk, /* in/out: unverified head */
+ xfs_daddr_t *tail_blk, /* out: tail block */
+ struct xfs_buf *bp,
+ xfs_daddr_t *rhead_blk, /* start blk of last record */
+ struct xlog_rec_header **rhead, /* ptr to last record */
+ bool *wrapped) /* last rec. wraps phys. log */
+{
+ struct xlog_rec_header *tmp_rhead;
+ struct xfs_buf *tmp_bp;
+ xfs_daddr_t first_bad;
+ xfs_daddr_t tmp_rhead_blk;
+ int found;
+ int error;
+ bool tmp_wrapped;
+
+ /*
+ * Check the head of the log for torn writes. Search backwards from the
+ * head until we hit the tail or the maximum number of log record I/Os
+ * that could have been in flight at one time. Use a temporary buffer so
+ * we don't trash the rhead/bp pointers from the caller.
+ */
+ tmp_bp = xlog_get_bp(log, 1);
+ if (!tmp_bp)
+ return -ENOMEM;
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+ XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
+ &tmp_rhead, &tmp_wrapped);
+ xlog_put_bp(tmp_bp);
+ if (error < 0)
+ return error;
+
+ /*
+ * Now run a CRC verification pass over the records starting at the
+ * block found above to the current head. If a CRC failure occurs, the
+ * log block of the first bad record is saved in first_bad.
+ */
+ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+ XLOG_RECOVER_CRCPASS, &first_bad);
+ if (error == -EFSBADCRC) {
+ /*
+ * We've hit a potential torn write. Reset the error and warn
+ * about it.
+ */
+ error = 0;
+ xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+ first_bad, *head_blk);
+
+ /*
+ * Get the header block and buffer pointer for the last good
+ * record before the bad record.
+ *
+ * Note that xlog_find_tail() clears the blocks at the new head
+ * (i.e., the records with invalid CRC) if the cycle number
+ * matches the current cycle.
+ */
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
+ rhead_blk, rhead, wrapped);
+ if (found < 0)
+ return found;
+ if (found == 0) /* XXX: right thing to do here? */
+ return -EIO;
+
+ /*
+ * Reset the head block to the starting block of the first bad
+ * log record and set the tail block based on the last good
+ * record.
+ *
+ * Bail out if the updated head/tail match as this indicates
+ * possible corruption outside of the acceptable
+ * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+ */
+ *head_blk = first_bad;
+ *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+ if (*head_blk == *tail_blk) {
+ ASSERT(0);
+ return 0;
+ }
+
+ /*
+ * Now verify the tail based on the updated head. This is
+ * required because the torn writes trimmed from the head could
+ * have been written over the tail of a previous record. Return
+ * any errors since recovery cannot proceed if the tail is
+ * corrupt.
+ *
+ * XXX: This leaves a gap in truly robust protection from torn
+ * writes in the log. If the head is behind the tail, the tail
+ * pushes forward to create some space and then a crash occurs
+ * causing the writes into the previous record's tail region to
+ * tear, log recovery isn't able to recover.
+ *
+ * How likely is this to occur? If possible, can we do something
+ * more intelligent here? Is it safe to push the tail forward if
+ * we can determine that the tail is within the range of the
+ * torn write (e.g., the kernel can only overwrite the tail if
+ * it has actually been pushed forward)? Alternatively, could we
+ * somehow prevent this condition at runtime?
+ */
+ error = xlog_verify_tail(log, *head_blk, *tail_blk);
+ }
+
+ return error;
+}
+
+/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+ struct xlog *log,
+ xfs_daddr_t *head_blk,
+ xfs_daddr_t *tail_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ struct xfs_buf *bp,
+ bool *clean)
+{
+ struct xlog_op_header *op_head;
+ xfs_daddr_t umount_data_blk;
+ xfs_daddr_t after_umount_blk;
+ int hblks;
+ int error;
+ char *offset;
+
+ *clean = false;
/*
- * Look for unmount record. If we find it, then we know there
- * was a clean unmount. Since 'i' could be the last block in
- * the physical log, we convert to a log block before comparing
- * to the head_blk.
+ * Look for unmount record. If we find it, then we know there was a
+ * clean unmount. Since rhead_blk could be the last block in the physical
+ * log, we convert to a log block before comparing to the head_blk.
*
- * Save the current tail lsn to use to pass to
- * xlog_clear_stale_blocks() below. We won't want to clear the
- * unmount record if there is one, so we pass the lsn of the
- * unmount record rather than the block after it.
+ * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+ * below. We won't want to clear the unmount record if there is one, so
+ * we pass the lsn of the unmount record rather than the block after it.
*/
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
int h_size = be32_to_cpu(rhead->h_size);
@@ -1014,22 +1245,22 @@ xlog_find_tail(
} else {
hblks = 1;
}
- after_umount_blk = (i + hblks + (int)
- BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
- tail_lsn = atomic64_read(&log->l_tail_lsn);
+ after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+ after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
- umount_data_blk = (i + hblks) % log->l_logBBsize;
+ umount_data_blk = rhead_blk + hblks;
+ umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
if (error)
- goto done;
+ return error;
- op_head = (xlog_op_header_t *)offset;
+ op_head = (struct xlog_op_header *)offset;
if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
/*
- * Set tail and last sync so that newly written
- * log records will point recovery to after the
- * current unmount record.
+ * Set tail and last sync so that newly written log
+ * records will point recovery to after the current
+ * unmount record.
*/
xlog_assign_atomic_lsn(&log->l_tail_lsn,
log->l_curr_cycle, after_umount_blk);
@@ -1037,17 +1268,167 @@ xlog_find_tail(
log->l_curr_cycle, after_umount_blk);
*tail_blk = after_umount_blk;
- /*
- * Note that the unmount was clean. If the unmount
- * was not clean, we need to know this to rebuild the
- * superblock counters from the perag headers if we
- * have a filesystem using non-persistent counters.
- */
- log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+ *clean = true;
+ }
+ }
+
+ return 0;
+}
+
+static void
+xlog_set_state(
+ struct xlog *log,
+ xfs_daddr_t head_blk,
+ struct xlog_rec_header *rhead,
+ xfs_daddr_t rhead_blk,
+ bool bump_cycle)
+{
+ /*
+ * Reset log values according to the state of the log when we
+ * crashed. In the case where head_blk == 0, we bump curr_cycle
+ * one because the next write starts a new cycle rather than
+ * continuing the cycle of the last good log record. At this
+ * point we have guaranteed that all partial log records have been
+ * accounted for. Therefore, we know that the last good log record
+ * written was complete and ended exactly on the end boundary
+ * of the physical log.
+ */
+ log->l_prev_block = rhead_blk;
+ log->l_curr_block = (int)head_blk;
+ log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+ if (bump_cycle)
+ log->l_curr_cycle++;
+ atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+ atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+}
+
+/*
+ * Find the sync block number or the tail of the log.
+ *
+ * This will be the block number of the last record to have its
+ * associated buffers synced to disk. Every log record header has
+ * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
+ * to get a sync block number. The only concern is to figure out which
+ * log record header to believe.
+ *
+ * The following algorithm uses the log record header with the largest
+ * lsn. The entire log record does not need to be valid. We only care
+ * that the header is valid.
+ *
+ * We could speed up search by using current head_blk buffer, but it is not
+ * available.
+ */
+STATIC int
+xlog_find_tail(
+ struct xlog *log,
+ xfs_daddr_t *head_blk,
+ xfs_daddr_t *tail_blk)
+{
+ xlog_rec_header_t *rhead;
+ char *offset = NULL;
+ xfs_buf_t *bp;
+ int error;
+ xfs_daddr_t rhead_blk;
+ xfs_lsn_t tail_lsn;
+ bool wrapped = false;
+ bool clean = false;
+
+ /*
+ * Find previous log record
+ */
+ if ((error = xlog_find_head(log, head_blk)))
+ return error;
+ ASSERT(*head_blk < INT_MAX);
+
+ bp = xlog_get_bp(log, 1);
+ if (!bp)
+ return -ENOMEM;
+ if (*head_blk == 0) { /* special case */
+ error = xlog_bread(log, 0, 1, bp, &offset);
+ if (error)
+ goto done;
+
+ if (xlog_get_cycle(offset) == 0) {
+ *tail_blk = 0;
+ /* leave all other log inited values alone */
+ goto done;
+ }
+ }
+
+ /*
+ * Search backwards through the log looking for the log record header
+ * block. This wraps all the way back around to the head so something is
+ * seriously wrong if we can't find it.
+ */
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+ &rhead_blk, &rhead, &wrapped);
+ if (error < 0)
+ return error;
+ if (!error) {
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ return -EIO;
+ }
+ *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+
+ /*
+ * Set the log state based on the current head record.
+ */
+ xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
+
+ /*
+ * Look for an unmount record at the head of the log. This sets the log
+ * state to determine whether recovery is necessary.
+ */
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+ rhead_blk, bp, &clean);
+ if (error)
+ goto done;
+
+ /*
+ * Verify the log head if the log is not clean (i.e., we have anything
+ * but an unmount record at the head). This uses CRC verification to
+ * detect and trim torn writes. If discovered, CRC failures are
+ * considered torn writes and the log head is trimmed accordingly.
+ *
+ * Note that we can only run CRC verification when the log is dirty
+ * because there's no guarantee that the log data behind an unmount
+ * record is compatible with the current architecture.
+ */
+ if (!clean) {
+ xfs_daddr_t orig_head = *head_blk;
+
+ error = xlog_verify_head(log, head_blk, tail_blk, bp,
+ &rhead_blk, &rhead, &wrapped);
+ if (error)
+ goto done;
+
+ /* update in-core state again if the head changed */
+ if (*head_blk != orig_head) {
+ xlog_set_state(log, *head_blk, rhead, rhead_blk,
+ wrapped);
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
+ error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+ rhead, rhead_blk, bp,
+ &clean);
+ if (error)
+ goto done;
}
}
/*
+ * Note that the unmount was clean. If the unmount was not clean, we
+ * need to know this to rebuild the superblock counters from the perag
+ * headers if we have a filesystem using non-persistent counters.
+ */
+ if (clean)
+ log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+
+ /*
* Make sure that there are no blocks in front of the head
* with the same cycle number as the head. This can happen
* because we allow multiple outstanding log writes concurrently,
@@ -2156,6 +2537,13 @@ xlog_recover_validate_buf_type(
}
bp->b_ops = &xfs_sb_buf_ops;
break;
+#ifdef CONFIG_XFS_RT
+ case XFS_BLFT_RTBITMAP_BUF:
+ case XFS_BLFT_RTSUMMARY_BUF:
+ /* no magic numbers for verification of RT buffers */
+ bp->b_ops = &xfs_rtbuf_ops;
+ break;
+#endif /* CONFIG_XFS_RT */
default:
xfs_warn(mp, "Unknown buffer type %d!",
xfs_blft_from_flags(buf_f));
@@ -2476,7 +2864,7 @@ xfs_recover_inode_owner_change(
return -ENOMEM;
/* instantiate the inode */
- xfs_dinode_from_disk(&ip->i_d, dip);
+ xfs_inode_from_disk(ip, dip);
ASSERT(ip->i_d.di_version >= 3);
error = xfs_iformat_fork(ip, dip);
@@ -2522,7 +2910,7 @@ xlog_recover_inode_pass2(
int error;
int attr_index;
uint fields;
- xfs_icdinode_t *dicp;
+ struct xfs_log_dinode *ldip;
uint isize;
int need_free = 0;
@@ -2575,8 +2963,8 @@ xlog_recover_inode_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- dicp = item->ri_buf[1].i_addr;
- if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
+ ldip = item->ri_buf[1].i_addr;
+ if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
__func__, item, in_f->ilf_ino);
@@ -2612,13 +3000,13 @@ xlog_recover_inode_pass2(
* to skip replay when the on disk inode is newer than the log one
*/
if (!xfs_sb_version_hascrc(&mp->m_sb) &&
- dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
*/
if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
- dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
+ ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
/* do nothing */
} else {
trace_xfs_log_recover_inode_skip(log, in_f);
@@ -2628,13 +3016,13 @@ xlog_recover_inode_pass2(
}
/* Take the opportunity to reset the flush iteration count */
- dicp->di_flushiter = 0;
+ ldip->di_flushiter = 0;
- if (unlikely(S_ISREG(dicp->di_mode))) {
- if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+ if (unlikely(S_ISREG(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad regular inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -2642,12 +3030,12 @@ xlog_recover_inode_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- } else if (unlikely(S_ISDIR(dicp->di_mode))) {
- if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
- (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
- (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+ } else if (unlikely(S_ISDIR(ldip->di_mode))) {
+ if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
+ (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
+ (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad dir inode log record, rec ptr 0x%p, "
"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
@@ -2656,32 +3044,32 @@ xlog_recover_inode_pass2(
goto out_release;
}
}
- if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
+ if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
__func__, item, dip, bp, in_f->ilf_ino,
- dicp->di_nextents + dicp->di_anextents,
- dicp->di_nblocks);
+ ldip->di_nextents + ldip->di_anextents,
+ ldip->di_nblocks);
error = -EFSCORRUPTED;
goto out_release;
}
- if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
+ if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
+ item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
error = -EFSCORRUPTED;
goto out_release;
}
- isize = xfs_icdinode_size(dicp->di_version);
+ isize = xfs_log_dinode_size(ldip->di_version);
if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, dicp);
+ XFS_ERRLEVEL_LOW, mp, ldip);
xfs_alert(mp,
"%s: Bad inode log record length %d, rec ptr 0x%p",
__func__, item->ri_buf[1].i_len, item);
@@ -2689,8 +3077,8 @@ xlog_recover_inode_pass2(
goto out_release;
}
- /* The core is in in-core format */
- xfs_dinode_to_disk(dip, dicp);
+ /* recover the log dinode into the on-disk inode */
+ xfs_log_dinode_to_disk(ldip, dip);
/* the rest is in on-disk format */
if (item->ri_buf[1].i_len > isize) {
@@ -3204,6 +3592,7 @@ xlog_recover_dquot_ra_pass2(
struct xfs_disk_dquot *recddq;
struct xfs_dq_logformat *dq_f;
uint type;
+ int len;
if (mp->m_qflags == 0)
@@ -3224,8 +3613,12 @@ xlog_recover_dquot_ra_pass2(
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+ len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
+ if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
+ return;
+
+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
+ &xfs_dquot_buf_ra_ops);
}
STATIC void
@@ -3431,7 +3824,7 @@ xlog_recover_add_to_cont_trans(
* previous record. Copy the rest of the header.
*/
if (list_empty(&trans->r_itemq)) {
- ASSERT(len < sizeof(struct xfs_trans_header));
+ ASSERT(len <= sizeof(struct xfs_trans_header));
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
return -EIO;
@@ -4015,8 +4408,8 @@ xlog_recover_process_one_iunlink(
if (error)
goto fail_iput;
- ASSERT(ip->i_d.di_nlink == 0);
- ASSERT(ip->i_d.di_mode != 0);
+ ASSERT(VFS_I(ip)->i_nlink == 0);
+ ASSERT(VFS_I(ip)->i_mode != 0);
/* setup for the next pass */
agino = be32_to_cpu(dip->di_next_unlinked);
@@ -4118,25 +4511,68 @@ xlog_recover_process_iunlinks(
mp->m_dmevmask = mp_dmevmask;
}
+STATIC int
+xlog_unpack_data(
+ struct xlog_rec_header *rhead,
+ char *dp,
+ struct xlog *log)
+{
+ int i, j, k;
+
+ for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
+ i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+ *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
+ for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+ dp += BBSIZE;
+ }
+ }
+
+ return 0;
+}
+
/*
- * Upack the log buffer data and crc check it. If the check fails, issue a
- * warning if and only if the CRC in the header is non-zero. This makes the
- * check an advisory warning, and the zero CRC check will prevent failure
- * warnings from being emitted when upgrading the kernel from one that does not
- * add CRCs by default.
- *
- * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
- * corruption failure
+ * CRC check, unpack and process a log record.
*/
STATIC int
-xlog_unpack_data_crc(
+xlog_recover_process(
+ struct xlog *log,
+ struct hlist_head rhash[],
struct xlog_rec_header *rhead,
char *dp,
- struct xlog *log)
+ int pass)
{
+ int error;
__le32 crc;
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+
+ /*
+ * Nothing else to do if this is a CRC verification pass. Only verify
+ * records with a non-zero crc. Unfortunately, mkfs always sets h_crc
+ * to 0 so we must consider a zero crc valid even on v5 supers.
+ * Otherwise, return EFSBADCRC on failure so the callers up the stack
+ * know precisely what failed.
+ */
+ if (pass == XLOG_RECOVER_CRCPASS) {
+ if (rhead->h_crc && crc != rhead->h_crc)
+ return -EFSBADCRC;
+ return 0;
+ }
+
+ /*
+ * We're in the normal recovery path. Issue a warning if and only if the
+ * CRC in the header is non-zero. This is an advisory warning and the
+ * zero CRC check prevents warnings from being emitted when upgrading
+ * the kernel from one that does not add CRCs by default.
+ */
if (crc != rhead->h_crc) {
if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
@@ -4147,47 +4583,18 @@ xlog_unpack_data_crc(
}
/*
- * If we've detected a log record corruption, then we can't
- * recover past this point. Abort recovery if we are enforcing
- * CRC protection by punting an error back up the stack.
+ * If the filesystem is CRC enabled, this mismatch becomes a
+ * fatal log corruption failure.
*/
if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
return -EFSCORRUPTED;
}
- return 0;
-}
-
-STATIC int
-xlog_unpack_data(
- struct xlog_rec_header *rhead,
- char *dp,
- struct xlog *log)
-{
- int i, j, k;
- int error;
-
- error = xlog_unpack_data_crc(rhead, dp, log);
+ error = xlog_unpack_data(rhead, dp, log);
if (error)
return error;
- for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
- dp += BBSIZE;
- }
-
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
- for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
- dp += BBSIZE;
- }
- }
-
- return 0;
+ return xlog_recover_process_data(log, rhash, rhead, dp, pass);
}
STATIC int
@@ -4239,18 +4646,21 @@ xlog_do_recovery_pass(
struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
- int pass)
+ int pass,
+ xfs_daddr_t *first_bad) /* out: first bad log rec */
{
xlog_rec_header_t *rhead;
xfs_daddr_t blk_no;
+ xfs_daddr_t rhead_blk;
char *offset;
xfs_buf_t *hbp, *dbp;
- int error = 0, h_size;
+ int error = 0, h_size, h_len;
int bblks, split_bblks;
int hblks, split_hblks, wrapped_hblks;
struct hlist_head rhash[XLOG_RHASH_SIZE];
ASSERT(head_blk != tail_blk);
+ rhead_blk = 0;
/*
* Read the header of the tail block and get the iclog buffer size from
@@ -4274,7 +4684,31 @@ xlog_do_recovery_pass(
error = xlog_valid_rec_header(log, rhead, tail_blk);
if (error)
goto bread_err1;
+
+ /*
+ * xfsprogs has a bug where record length is based on lsunit but
+ * h_size (iclog size) is hardcoded to 32k. Now that we
+ * unconditionally CRC verify the unmount record, this means the
+ * log buffer can be too small for the record and cause an
+ * overrun.
+ *
+ * Detect this condition here. Use lsunit for the buffer size as
+ * long as this looks like the mkfs case. Otherwise, return an
+ * error to avoid a buffer overrun.
+ */
h_size = be32_to_cpu(rhead->h_size);
+ h_len = be32_to_cpu(rhead->h_len);
+ if (h_len > h_size) {
+ if (h_len <= log->l_mp->m_logbsize &&
+ be32_to_cpu(rhead->h_num_logops) == 1) {
+ xfs_warn(log->l_mp,
+ "invalid iclog size (%d bytes), using lsunit (%d bytes)",
+ h_size, log->l_mp->m_logbsize);
+ h_size = log->l_mp->m_logbsize;
+ } else
+ return -EFSCORRUPTED;
+ }
+
if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
(h_size > XLOG_HEADER_CYCLE_SIZE)) {
hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
@@ -4301,7 +4735,7 @@ xlog_do_recovery_pass(
}
memset(rhash, 0, sizeof(rhash));
- blk_no = tail_blk;
+ blk_no = rhead_blk = tail_blk;
if (tail_blk > head_blk) {
/*
* Perform recovery around the end of the physical log.
@@ -4408,19 +4842,18 @@ xlog_do_recovery_pass(
goto bread_err2;
}
- error = xlog_unpack_data(rhead, offset, log);
+ error = xlog_recover_process(log, rhash, rhead, offset,
+ pass);
if (error)
goto bread_err2;
- error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass);
- if (error)
- goto bread_err2;
blk_no += bblks;
+ rhead_blk = blk_no;
}
ASSERT(blk_no >= log->l_logBBsize);
blk_no -= log->l_logBBsize;
+ rhead_blk = blk_no;
}
/* read first part of physical log */
@@ -4441,21 +4874,22 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- error = xlog_unpack_data(rhead, offset, log);
+ error = xlog_recover_process(log, rhash, rhead, offset, pass);
if (error)
goto bread_err2;
- error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass);
- if (error)
- goto bread_err2;
blk_no += bblks + hblks;
+ rhead_blk = blk_no;
}
bread_err2:
xlog_put_bp(dbp);
bread_err1:
xlog_put_bp(hbp);
+
+ if (error && first_bad)
+ *first_bad = rhead_blk;
+
return error;
}
@@ -4493,7 +4927,7 @@ xlog_do_log_recovery(
INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
- XLOG_RECOVER_PASS1);
+ XLOG_RECOVER_PASS1, NULL);
if (error != 0) {
kmem_free(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
@@ -4504,7 +4938,7 @@ xlog_do_log_recovery(
* When it is complete free the table of buf cancel items.
*/
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
- XLOG_RECOVER_PASS2);
+ XLOG_RECOVER_PASS2, NULL);
#ifdef DEBUG
if (!error) {
int i;
@@ -4529,6 +4963,7 @@ xlog_do_recover(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
+ struct xfs_mount *mp = log->l_mp;
int error;
xfs_buf_t *bp;
xfs_sb_t *sbp;
@@ -4543,7 +4978,7 @@ xlog_do_recover(
/*
* If IO errors happened during recovery, bail out.
*/
- if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ if (XFS_FORCED_SHUTDOWN(mp)) {
return -EIO;
}
@@ -4556,22 +4991,21 @@ xlog_do_recover(
* or iunlinks they will have some entries in the AIL; so we look at
* the AIL to determine how to set the tail_lsn.
*/
- xlog_assign_tail_lsn(log->l_mp);
+ xlog_assign_tail_lsn(mp);
/*
* Now that we've finished replaying all buffer and inode
* updates, re-read in the superblock and reverify it.
*/
- bp = xfs_getsb(log->l_mp, 0);
- XFS_BUF_UNDONE(bp);
- ASSERT(!(XFS_BUF_ISWRITE(bp)));
- XFS_BUF_READ(bp);
- XFS_BUF_UNASYNC(bp);
+ bp = xfs_getsb(mp, 0);
+ bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
+ ASSERT(!(bp->b_flags & XBF_WRITE));
+ bp->b_flags |= XBF_READ;
bp->b_ops = &xfs_sb_buf_ops;
error = xfs_buf_submit_wait(bp);
if (error) {
- if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_buf_ioerror_alert(bp, __func__);
ASSERT(0);
}
@@ -4580,14 +5014,17 @@ xlog_do_recover(
}
/* Convert superblock from on-disk format */
- sbp = &log->l_mp->m_sb;
+ sbp = &mp->m_sb;
xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
- ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
- ASSERT(xfs_sb_good_version(sbp));
- xfs_reinit_percpu_counters(log->l_mp);
-
xfs_buf_relse(bp);
+ /* re-initialise in-core superblock and geometry structures */
+ xfs_reinit_percpu_counters(mp);
+ error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+ if (error) {
+ xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
+ return error;
+ }
xlog_recover_check_summary(log);
@@ -4609,9 +5046,19 @@ xlog_recover(
int error;
/* find the tail of the log */
- if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+ error = xlog_find_tail(log, &head_blk, &tail_blk);
+ if (error)
return error;
+ /*
+ * The superblock was read before the log was available and thus the LSN
+ * could not be verified. Check the superblock LSN against the current
+ * LSN now that it's known.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+ !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+ return -EINVAL;
+
if (tail_blk != head_blk) {
/* There used to be a comment here:
*
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index d8b67547ab34..11792d888e4e 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,6 +17,7 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_error.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
+ int level; \
\
va_start(args, fmt); \
\
@@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
\
__xfs_printk(kern_level, mp, &vaf); \
va_end(args); \
+ \
+ if (!kstrtoint(kern_level, 0, &level) && \
+ level <= LOGLEVEL_ERR && \
+ xfs_error_level >= XFS_ERRLEVEL_HIGH) \
+ xfs_stack_trace(); \
} \
define_xfs_printk_level(xfs_emerg, KERN_EMERG);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bf92e0c037c7..cfd4210dd015 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
+void
+xfs_uuid_table_free(void)
+{
+ if (xfs_uuid_table_size == 0)
+ return;
+ kmem_free(xfs_uuid_table);
+ xfs_uuid_table = NULL;
+ xfs_uuid_table_size = 0;
+}
+
/*
* See if the UUID is unique among mounted XFS filesystems.
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -161,7 +171,7 @@ xfs_sb_validate_fsb_count(
ASSERT(sbp->sb_blocklog >= BBSHIFT);
/* Limited by ULONG_MAX of page cache index */
- if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+ if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
return -EFBIG;
return 0;
}
@@ -175,9 +185,6 @@ xfs_initialize_perag(
xfs_agnumber_t index;
xfs_agnumber_t first_initialised = 0;
xfs_perag_t *pag;
- xfs_agino_t agino;
- xfs_ino_t ino;
- xfs_sb_t *sbp = &mp->m_sb;
int error = -ENOMEM;
/*
@@ -220,22 +227,7 @@ xfs_initialize_perag(
radix_tree_preload_end();
}
- /*
- * If we mount with the inode64 option, or no inode overflows
- * the legacy 32-bit address space clear the inode32 option.
- */
- agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
- ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
-
- if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
- mp->m_flags |= XFS_MOUNT_32BITINODES;
- else
- mp->m_flags &= ~XFS_MOUNT_32BITINODES;
-
- if (mp->m_flags & XFS_MOUNT_32BITINODES)
- index = xfs_set_inode32(mp, agcount);
- else
- index = xfs_set_inode64(mp, agcount);
+ index = xfs_set_inode_alloc(mp, agcount);
if (maxagi)
*maxagi = index;
@@ -693,10 +685,15 @@ xfs_mountfs(
if (error)
goto out;
- error = xfs_uuid_mount(mp);
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_del_stats;
+
/*
* Set the minimum read and write sizes
*/
@@ -850,7 +847,7 @@ xfs_mountfs(
ASSERT(rip != NULL);
- if (unlikely(!S_ISDIR(rip->i_d.di_mode))) {
+ if (unlikely(!S_ISDIR(VFS_I(rip)->i_mode))) {
xfs_warn(mp, "corrupted root inode %llu: not a directory",
(unsigned long long)rip->i_ino);
xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -971,6 +968,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_del_stats:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
@@ -1047,6 +1046,7 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
+
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1056,6 +1056,7 @@ xfs_unmountfs(
#endif
xfs_free_perag(mp);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
@@ -1265,7 +1266,7 @@ xfs_getsb(
}
xfs_buf_hold(bp);
- ASSERT(XFS_BUF_ISDONE(bp));
+ ASSERT(bp->b_flags & XBF_DONE);
return bp;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7999e91cd49a..eafe257b357a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -127,6 +127,7 @@ typedef struct xfs_mount {
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_data_workqueue;
@@ -146,6 +147,17 @@ typedef struct xfs_mount {
* to various other kinds of pain inflicted on the pNFS server.
*/
__uint32_t m_generation;
+
+#ifdef DEBUG
+ /*
+ * DEBUG mode instrumentation to test and/or trigger delayed allocation
+ * block killing in the event of failed writes. When enabled, all
+ * buffered writes are forced to fail. All delalloc blocks in the range
+ * of the write (including pre-existing delalloc blocks!) are tossed as
+ * part of the write failure error handling sequence.
+ */
+ bool m_fail_writes;
+#endif
} xfs_mount_t;
/*
@@ -165,9 +177,8 @@ typedef struct xfs_mount {
#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
-#define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above
- * 32 bits in size */
-#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */
+#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */
+#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */
#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */
#define XFS_MOUNT_BARRIER (1ULL << 17)
#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/
@@ -220,12 +231,12 @@ static inline unsigned long
xfs_preferred_iosize(xfs_mount_t *mp)
{
if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
- return PAGE_CACHE_SIZE;
+ return PAGE_SIZE;
return (mp->m_swidth ?
(mp->m_swidth << mp->m_sb.sb_blocklog) :
((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
(1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
- PAGE_CACHE_SIZE));
+ PAGE_SIZE));
}
#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
@@ -263,6 +274,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
+#ifdef DEBUG
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+ return mp->m_fail_writes;
+}
+#else
+static inline bool
+xfs_mp_fail_writes(struct xfs_mount *mp)
+{
+ return 0;
+}
+#endif
+
/*
* Per-ag incore structure, copies of information in agf and agi, to improve the
* performance of allocation group selection.
@@ -312,6 +337,7 @@ typedef struct xfs_perag {
int pagb_count; /* pagb slots in use */
} xfs_perag_t;
+extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
@@ -325,7 +351,6 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
-extern int xfs_mount_log_sb(xfs_mount_t *);
extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
@@ -336,4 +361,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb);
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
new file mode 100644
index 000000000000..184c44effdd5
--- /dev/null
+++ b/fs/xfs/xfs_ondisk.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016 Oracle.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ONDISK_H
+#define __XFS_ONDISK_H
+
+#define XFS_CHECK_STRUCT_SIZE(structname, size) \
+ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
+ #structname ") is wrong, expected " #size)
+
+static inline void __init
+xfs_check_ondisk_structs(void)
+{
+ /* ag/file structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_acl, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_acl_entry, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agf, 224);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agfl, 36);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_agi, 336);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dqblk, 136);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 264);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_key_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4);
+
+ /* dir/attr trees */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_da3_node_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_blk_hdr, 48);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_data_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_free_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dir3_leaf_hdr, 64);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_hdr_t, 32);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4);
+
+ /*
+ * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to
+ * 4 bytes anyway so it's not obviously a problem. Hence for the moment
+ * we don't check this structure. This can be re-instated when the attr
+ * definitions are updated to use C99 flexible array members.
+ *
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
+ */
+
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
+ XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
+ XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
+
+ /* log structures */
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_32, 28);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_efi_log_format_64, 32);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_extent_32, 12);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_ictimestamp, 8);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_64, 56);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+}
+
+#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ab4a6066f7ca..51ddaf2c2b8c 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -42,11 +42,11 @@ xfs_break_layouts(
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
error = break_layout(inode, true);
*iolock = XFS_IOLOCK_EXCL;
if (with_imutex)
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
xfs_ilock(ip, *iolock);
}
@@ -181,6 +181,11 @@ xfs_fs_map_blocks(
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * xfs_iomap_write_direct() expects to take ownership of
+ * the shared ilock.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
error = xfs_iomap_write_direct(ip, offset, length,
&imap, nimaps);
if (error)
@@ -288,8 +293,8 @@ xfs_fs_commit_blocks(
* Make sure reads through the pagecache see the new data.
*/
error = invalidate_inode_pages2_range(inode->i_mapping,
- start >> PAGE_CACHE_SHIFT,
- (end - 1) >> PAGE_CACHE_SHIFT);
+ start >> PAGE_SHIFT,
+ (end - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(error);
error = xfs_iomap_write_unwritten(ip, start, length);
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 8147ac108820..93f74853961b 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
#ifndef _XFS_PNFS_H
#define _XFS_PNFS_H 1
-#ifdef CONFIG_NFSD_PNFS
+#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
struct iomap *iomap, bool write, u32 *device_generation);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index eac9549efd52..be125e1758c1 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -184,7 +184,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(mp, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
@@ -448,11 +448,11 @@ xfs_qm_dquot_isolate(
*/
if (dqp->q_nrefs) {
xfs_dqunlock(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
list_lru_isolate(lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
return LRU_REMOVED;
}
@@ -496,19 +496,19 @@ xfs_qm_dquot_isolate(
ASSERT(dqp->q_nrefs == 0);
list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
return LRU_SKIP;
out_unlock_dirty:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
xfs_dqunlock(dqp);
spin_lock(lru_lock);
return LRU_RETRY;
@@ -525,7 +525,7 @@ xfs_qm_shrink_scan(
unsigned long freed;
int error;
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
return 0;
INIT_LIST_HEAD(&isol.buffers);
@@ -560,6 +560,37 @@ xfs_qm_shrink_count(
return list_lru_shrink_count(&qi->qi_lru, sc);
}
+STATIC void
+xfs_qm_set_defquota(
+ xfs_mount_t *mp,
+ uint type,
+ xfs_quotainfo_t *qinf)
+{
+ xfs_dquot_t *dqp;
+ struct xfs_def_quota *defq;
+ int error;
+
+ error = xfs_qm_dqread(mp, 0, type, XFS_QMOPT_DOWARN, &dqp);
+
+ if (!error) {
+ xfs_disk_dquot_t *ddqp = &dqp->q_core;
+
+ defq = xfs_get_defquota(dqp, qinf);
+
+ /*
+ * Timers and warnings have already been set; let's just set the
+ * default limits for this quota type
+ */
+ defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+ xfs_qm_dqdestroy(dqp);
+ }
+}
+
/*
* This initializes all the quota information that's kept in the
* mount structure
@@ -606,19 +637,19 @@ xfs_qm_init_quotainfo(
* We try to get the limits from the superuser's limits fields.
* This is quite hacky, but it is standard quota practice.
*
- * We look at the USR dquot with id == 0 first, but if user quotas
- * are not enabled we goto the GRP dquot with id == 0.
- * We don't really care to keep separate default limits for user
- * and group quotas, at least not at this point.
- *
* Since we may not have done a quotacheck by this point, just read
* the dquot without attaching it to any hashtables or lists.
+ *
+ * Timers and warnings are globally set by the first timer found in
+ * user/group/proj quota types, otherwise a default value is used.
+ * This should be split into different fields per quota type.
*/
error = xfs_qm_dqread(mp, 0,
XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
(XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
XFS_DQ_PROJ),
XFS_QMOPT_DOWARN, &dqp);
+
if (!error) {
xfs_disk_dquot_t *ddqp = &dqp->q_core;
@@ -639,13 +670,6 @@ xfs_qm_init_quotainfo(
be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
- qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
- qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
- qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
- qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
- qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
- qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-
xfs_qm_dqdestroy(dqp);
} else {
qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -656,6 +680,13 @@ xfs_qm_init_quotainfo(
qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
}
+ if (XFS_IS_UQUOTA_RUNNING(mp))
+ xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
+ if (XFS_IS_GQUOTA_RUNNING(mp))
+ xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf);
+ if (XFS_IS_PQUOTA_RUNNING(mp))
+ xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
+
qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 996a04064894..2975a822e9f0 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -53,6 +53,15 @@ extern struct kmem_zone *xfs_qm_dqtrxzone;
*/
#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
+struct xfs_def_quota {
+ xfs_qcnt_t bhardlimit; /* default data blk hard limit */
+ xfs_qcnt_t bsoftlimit; /* default data blk soft limit */
+ xfs_qcnt_t ihardlimit; /* default inode count hard limit */
+ xfs_qcnt_t isoftlimit; /* default inode count soft limit */
+ xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */
+ xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */
+};
+
/*
* Various quota information for individual filesystems.
* The mount structure keeps a pointer to this.
@@ -76,12 +85,9 @@ typedef struct xfs_quotainfo {
struct mutex qi_quotaofflock;/* to serialize quotaoff */
xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
uint qi_dqperchunk; /* # ondisk dqs in above chunk */
- xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
- xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */
- xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */
- xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
- xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
- xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
+ struct xfs_def_quota qi_usr_default;
+ struct xfs_def_quota qi_grp_default;
+ struct xfs_def_quota qi_prj_default;
struct shrinker qi_shrinker;
} xfs_quotainfo_t;
@@ -104,15 +110,15 @@ xfs_dquot_tree(
}
static inline struct xfs_inode *
-xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
+xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
{
- switch (dqp->dq_flags & XFS_DQ_ALLTYPES) {
+ switch (dq_flags & XFS_DQ_ALLTYPES) {
case XFS_DQ_USER:
- return dqp->q_mount->m_quotainfo->qi_uquotaip;
+ return mp->m_quotainfo->qi_uquotaip;
case XFS_DQ_GROUP:
- return dqp->q_mount->m_quotainfo->qi_gquotaip;
+ return mp->m_quotainfo->qi_gquotaip;
case XFS_DQ_PROJ:
- return dqp->q_mount->m_quotainfo->qi_pquotaip;
+ return mp->m_quotainfo->qi_pquotaip;
default:
ASSERT(0);
}
@@ -164,11 +170,27 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
- uint, struct qc_dqblk *);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *,
+ uint, struct qc_dqblk *, uint);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
struct qc_dqblk *);
extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
+static inline struct xfs_def_quota *
+xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi)
+{
+ struct xfs_def_quota *defq;
+
+ if (XFS_QM_ISUDQ(dqp))
+ defq = &qi->qi_usr_default;
+ else if (XFS_QM_ISGDQ(dqp))
+ defq = &qi->qi_grp_default;
+ else {
+ ASSERT(XFS_QM_ISPDQ(dqp));
+ defq = &qi->qi_prj_default;
+ }
+ return defq;
+}
+
#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 3640c6e896af..f4d0e0a8f517 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -404,6 +404,7 @@ xfs_qm_scall_setqlim(
struct xfs_disk_dquot *ddq;
struct xfs_dquot *dqp;
struct xfs_trans *tp;
+ struct xfs_def_quota *defq;
int error;
xfs_qcnt_t hard, soft;
@@ -431,6 +432,8 @@ xfs_qm_scall_setqlim(
ASSERT(error != -ENOENT);
goto out_unlock;
}
+
+ defq = xfs_get_defquota(dqp, q);
xfs_dqunlock(dqp);
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
@@ -458,8 +461,8 @@ xfs_qm_scall_setqlim(
ddq->d_blk_softlimit = cpu_to_be64(soft);
xfs_dquot_set_prealloc_limits(dqp);
if (id == 0) {
- q->qi_bhardlimit = hard;
- q->qi_bsoftlimit = soft;
+ defq->bhardlimit = hard;
+ defq->bsoftlimit = soft;
}
} else {
xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
@@ -474,8 +477,8 @@ xfs_qm_scall_setqlim(
ddq->d_rtb_hardlimit = cpu_to_be64(hard);
ddq->d_rtb_softlimit = cpu_to_be64(soft);
if (id == 0) {
- q->qi_rtbhardlimit = hard;
- q->qi_rtbsoftlimit = soft;
+ defq->rtbhardlimit = hard;
+ defq->rtbsoftlimit = soft;
}
} else {
xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
@@ -491,8 +494,8 @@ xfs_qm_scall_setqlim(
ddq->d_ino_hardlimit = cpu_to_be64(hard);
ddq->d_ino_softlimit = cpu_to_be64(soft);
if (id == 0) {
- q->qi_ihardlimit = hard;
- q->qi_isoftlimit = soft;
+ defq->ihardlimit = hard;
+ defq->isoftlimit = soft;
}
} else {
xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
@@ -635,9 +638,10 @@ out:
int
xfs_qm_scall_getquota(
struct xfs_mount *mp,
- xfs_dqid_t id,
+ xfs_dqid_t *id,
uint type,
- struct qc_dqblk *dst)
+ struct qc_dqblk *dst,
+ uint dqget_flags)
{
struct xfs_dquot *dqp;
int error;
@@ -647,7 +651,7 @@ xfs_qm_scall_getquota(
* we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
* exist, we'll get ENOENT back.
*/
- error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+ error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp);
if (error)
return error;
@@ -660,6 +664,9 @@ xfs_qm_scall_getquota(
goto out_put;
}
+ /* Fill in the ID we actually read from disk */
+ *id = be32_to_cpu(dqp->q_core.d_id);
+
memset(dst, 0, sizeof(*dst));
dst->d_spc_hardlimit =
XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
@@ -701,7 +708,7 @@ xfs_qm_scall_getquota(
if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
(XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
(XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
- id != 0) {
+ *id != 0) {
if ((dst->d_space > dst->d_spc_softlimit) &&
(dst->d_spc_softlimit > 0)) {
ASSERT(dst->d_spc_timer != 0);
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7795e0d01382..f82d79a8c694 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -231,14 +231,45 @@ xfs_fs_get_dqblk(
struct qc_dqblk *qdq)
{
struct xfs_mount *mp = XFS_M(sb);
+ xfs_dqid_t id;
if (!XFS_IS_QUOTA_RUNNING(mp))
return -ENOSYS;
if (!XFS_IS_QUOTA_ON(mp))
return -ESRCH;
- return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
- xfs_quota_type(qid.type), qdq);
+ id = from_kqid(&init_user_ns, qid);
+ return xfs_qm_scall_getquota(mp, &id,
+ xfs_quota_type(qid.type), qdq, 0);
+}
+
+/* Return quota info for active quota >= this qid */
+STATIC int
+xfs_fs_get_nextdqblk(
+ struct super_block *sb,
+ struct kqid *qid,
+ struct qc_dqblk *qdq)
+{
+ int ret;
+ struct xfs_mount *mp = XFS_M(sb);
+ xfs_dqid_t id;
+
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -ESRCH;
+
+ id = from_kqid(&init_user_ns, *qid);
+ ret = xfs_qm_scall_getquota(mp, &id,
+ xfs_quota_type(qid->type), qdq,
+ XFS_QMOPT_DQNEXT);
+ if (ret)
+ return ret;
+
+ /* ID may be different, so convert back what we got */
+ *qid = make_kqid(current_user_ns(), qid->type, id);
+ return 0;
+
}
STATIC int
@@ -267,5 +298,6 @@ const struct quotactl_ops xfs_quotactl_operations = {
.quota_disable = xfs_quota_disable,
.rm_xquota = xfs_fs_rm_xquota,
.get_dqblk = xfs_fs_get_dqblk,
+ .get_nextdqblk = xfs_fs_get_nextdqblk,
.set_dqblk = xfs_fs_set_dqblk,
};
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ab1bac6a3a1c..abf44435d04a 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -766,7 +766,6 @@ xfs_growfs_rt_alloc(
{
xfs_fileoff_t bno; /* block number in file */
struct xfs_buf *bp; /* temporary buffer for zeroing */
- int committed; /* transaction committed flag */
xfs_daddr_t d; /* disk block address */
int error; /* error return value */
xfs_fsblock_t firstblock;/* first block allocated in xaction */
@@ -811,7 +810,7 @@ xfs_growfs_rt_alloc(
/*
* Free any blocks freed up in the transaction, then commit.
*/
- error = xfs_bmap_finish(&tp, &flist, &committed);
+ error = xfs_bmap_finish(&tp, &flist, NULL);
if (error)
goto out_bmap_cancel;
error = xfs_trans_commit(tp);
@@ -1273,7 +1272,7 @@ xfs_rtpick_extent(
ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
- seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
+ seqp = (__uint64_t *)&VFS_I(mp->m_rbmip)->i_atime;
if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
*seqp = 0;
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f2240383d4bb..8686df6c7609 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -18,20 +18,21 @@
#include "xfs.h"
#include <linux/proc_fs.h>
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
+struct xstats xfsstats;
-static int counter_val(int idx)
+static int counter_val(struct xfsstats __percpu *stats, int idx)
{
int val = 0, cpu;
for_each_possible_cpu(cpu)
- val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+ val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx));
return val;
}
-static int xfs_stat_proc_show(struct seq_file *m, void *v)
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{
int i, j;
+ int len = 0;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -65,54 +66,59 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
};
/* Loop over all stats groups */
+
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- seq_printf(m, "%s", xstats[i].desc);
+ len += snprintf(buf + len, PATH_MAX - len, "%s",
+ xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- seq_printf(m, " %u", counter_val(j));
- seq_putc(m, '\n');
+ len += snprintf(buf + len, PATH_MAX - len, " %u",
+ counter_val(stats, j));
+ len += snprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
- xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
- xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
- xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
}
- seq_printf(m, "xpc %Lu %Lu %Lu\n",
+ len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- seq_printf(m, "debug %u\n",
+ len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
- return 0;
+
+ return len;
}
-static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+void xfs_stats_clearall(struct xfsstats __percpu *stats)
{
- return single_open(file, xfs_stat_proc_show, NULL);
+ int c;
+ __uint32_t vn_active;
+
+ xfs_notice(NULL, "Clearing xfsstats");
+ for_each_possible_cpu(c) {
+ preempt_disable();
+ /* save vn_active, it's a universal truth! */
+ vn_active = per_cpu_ptr(stats, c)->vn_active;
+ memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
+ per_cpu_ptr(stats, c)->vn_active = vn_active;
+ preempt_enable();
+ }
}
-static const struct file_operations xfs_stat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xfs_stat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
seq_printf(m, "%d\t%d\t%d\t%u\n",
- 0,
- counter_val(XFSSTAT_END_XQMSTAT),
- 0,
- counter_val(XFSSTAT_END_XQMSTAT + 1));
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
return 0;
}
@@ -136,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
seq_printf(m, "qm");
for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
- seq_printf(m, " %u", counter_val(j));
+ seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
}
@@ -155,44 +161,35 @@ static const struct file_operations xqmstat_proc_fops = {
};
#endif /* CONFIG_XFS_QUOTA */
+#ifdef CONFIG_PROC_FS
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
+ return -ENOMEM;
+
+ if (!proc_symlink("fs/xfs/stat", NULL,
+ "/sys/fs/xfs/stats/stats"))
goto out;
- if (!proc_create("fs/xfs/stat", 0, NULL,
- &xfs_stat_proc_fops))
- goto out_remove_xfs_dir;
#ifdef CONFIG_XFS_QUOTA
if (!proc_create("fs/xfs/xqmstat", 0, NULL,
&xqmstat_proc_fops))
- goto out_remove_stat_file;
+ goto out;
if (!proc_create("fs/xfs/xqm", 0, NULL,
&xqm_proc_fops))
- goto out_remove_xqmstat_file;
+ goto out;
#endif
return 0;
-#ifdef CONFIG_XFS_QUOTA
- out_remove_xqmstat_file:
- remove_proc_entry("fs/xfs/xqmstat", NULL);
- out_remove_stat_file:
- remove_proc_entry("fs/xfs/stat", NULL);
-#endif
- out_remove_xfs_dir:
- remove_proc_entry("fs/xfs", NULL);
- out:
+out:
+ remove_proc_subtree("fs/xfs", NULL);
return -ENOMEM;
}
void
xfs_cleanup_procfs(void)
{
-#ifdef CONFIG_XFS_QUOTA
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-#endif
- remove_proc_entry("fs/xfs/stat", NULL);
- remove_proc_entry("fs/xfs", NULL);
+ remove_proc_subtree("fs/xfs", NULL);
}
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c8f238b8299a..483b0eff1988 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -19,8 +19,6 @@
#define __XFS_STATS_H__
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
#include <linux/percpu.h>
/*
@@ -215,15 +213,29 @@ struct xfsstats {
__uint64_t xs_read_bytes;
};
-DECLARE_PER_CPU(struct xfsstats, xfsstats);
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_clearall(struct xfsstats __percpu *stats);
+extern struct xstats xfsstats;
-/*
- * We don't disable preempt, not too worried about poking the
- * wrong CPU's stat for now (also aggregated before reporting).
- */
-#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
-#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
-#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
+#define XFS_STATS_INC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \
+} while (0)
+
+#define XFS_STATS_DEC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \
+} while (0)
+
+#define XFS_STATS_ADD(mp, v, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \
+} while (0)
+
+#if defined(CONFIG_PROC_FS)
extern int xfs_init_procfs(void);
extern void xfs_cleanup_procfs(void);
@@ -231,10 +243,6 @@ extern void xfs_cleanup_procfs(void);
#else /* !CONFIG_PROC_FS */
-# define XFS_STATS_INC(count)
-# define XFS_STATS_DEC(count)
-# define XFS_STATS_ADD(count, inc)
-
static inline int xfs_init_procfs(void)
{
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 904f637cfa5f..187e14b696c2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -45,6 +45,7 @@
#include "xfs_filestream.h"
#include "xfs_quota.h"
#include "xfs_sysfs.h"
+#include "xfs_ondisk.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -65,83 +66,85 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
-#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
-#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
-#define MNTOPT_LOGDEV "logdev" /* log device */
-#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
-#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
-#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
-#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
-#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
-#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
-#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */
-#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */
-#define MNTOPT_MTPT "mtpt" /* filesystem mount point */
-#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */
-#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */
-#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */
-#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */
-#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */
-#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
-#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and
- * unwritten extent conversion */
-#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */
-#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
-#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to
- * XFS_MAXINUMBER_32 */
-#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
-#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
-#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */
-#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes
- * in stat(). */
-#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */
-#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */
-#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */
-#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
-#define MNTOPT_NOQUOTA "noquota" /* no quotas */
-#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
-#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
-#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */
-#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
-#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
-#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */
-#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
-#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
-#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
-#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
-#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
-#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
-
-#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */
-
/*
* Table driven mount option parser.
- *
- * Currently only used for remount, but it will be used for mount
- * in the future, too.
*/
enum {
- Opt_barrier,
- Opt_nobarrier,
- Opt_inode64,
- Opt_inode32,
- Opt_err
+ Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
+ Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
+ Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
+ Opt_allocsize, Opt_norecovery, Opt_barrier, Opt_nobarrier,
+ Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep,
+ Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams,
+ Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
+ Opt_uquota, Opt_gquota, Opt_pquota,
+ Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
};
static const match_table_t tokens = {
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_inode64, "inode64"},
- {Opt_inode32, "inode32"},
- {Opt_err, NULL}
+ {Opt_logbufs, "logbufs=%u"}, /* number of XFS log buffers */
+ {Opt_logbsize, "logbsize=%s"}, /* size of XFS log buffers */
+ {Opt_logdev, "logdev=%s"}, /* log device */
+ {Opt_rtdev, "rtdev=%s"}, /* realtime I/O device */
+ {Opt_biosize, "biosize=%u"}, /* log2 of preferred buffered io size */
+ {Opt_wsync, "wsync"}, /* safe-mode nfs compatible mount */
+ {Opt_noalign, "noalign"}, /* turn off stripe alignment */
+ {Opt_swalloc, "swalloc"}, /* turn on stripe width allocation */
+ {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
+ {Opt_swidth, "swidth=%u"}, /* data volume stripe width */
+ {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
+ {Opt_mtpt, "mtpt"}, /* filesystem mount point */
+ {Opt_grpid, "grpid"}, /* group-ID from parent directory */
+ {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
+ {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
+ {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */
+ {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */
+ {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */
+ {Opt_barrier, "barrier"}, /* use writer barriers for log write and
+ * unwritten extent conversion */
+ {Opt_nobarrier, "nobarrier"}, /* .. disable */
+ {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */
+ {Opt_inode32, "inode32"}, /* inode allocation limited to
+ * XFS_MAXINUMBER_32 */
+ {Opt_ikeep, "ikeep"}, /* do not free empty inode clusters */
+ {Opt_noikeep, "noikeep"}, /* free empty inode clusters */
+ {Opt_largeio, "largeio"}, /* report large I/O sizes in stat() */
+ {Opt_nolargeio, "nolargeio"}, /* do not report large I/O sizes
+ * in stat(). */
+ {Opt_attr2, "attr2"}, /* do use attr2 attribute format */
+ {Opt_noattr2, "noattr2"}, /* do not use attr2 attribute format */
+ {Opt_filestreams,"filestreams"},/* use filestreams allocator */
+ {Opt_quota, "quota"}, /* disk quotas (user) */
+ {Opt_noquota, "noquota"}, /* no quotas */
+ {Opt_usrquota, "usrquota"}, /* user quota enabled */
+ {Opt_grpquota, "grpquota"}, /* group quota enabled */
+ {Opt_prjquota, "prjquota"}, /* project quota enabled */
+ {Opt_uquota, "uquota"}, /* user quota (IRIX variant) */
+ {Opt_gquota, "gquota"}, /* group quota (IRIX variant) */
+ {Opt_pquota, "pquota"}, /* project quota (IRIX variant) */
+ {Opt_uqnoenforce,"uqnoenforce"},/* user quota limit enforcement */
+ {Opt_gqnoenforce,"gqnoenforce"},/* group quota limit enforcement */
+ {Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
+ {Opt_qnoenforce, "qnoenforce"}, /* same as uqnoenforce */
+ {Opt_discard, "discard"}, /* Discard unused blocks */
+ {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
+
+ {Opt_dax, "dax"}, /* Enable direct access to bdev pages */
+ {Opt_err, NULL},
};
-STATIC unsigned long
-suffix_kstrtoint(char *s, unsigned int base, int *res)
+STATIC int
+suffix_kstrtoint(const substring_t *s, unsigned int base, int *res)
{
int last, shift_left_factor = 0, _res;
- char *value = s;
+ char *value;
+ int ret = 0;
+
+ value = match_strdup(s);
+ if (!value)
+ return -ENOMEM;
last = strlen(value) - 1;
if (value[last] == 'K' || value[last] == 'k') {
@@ -157,10 +160,11 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
value[last] = '\0';
}
- if (kstrtoint(s, base, &_res))
- return -EINVAL;
+ if (kstrtoint(value, base, &_res))
+ ret = -EINVAL;
+ kfree(value);
*res = _res << shift_left_factor;
- return 0;
+ return ret;
}
/*
@@ -169,14 +173,19 @@ suffix_kstrtoint(char *s, unsigned int base, int *res)
*
* Note that this function leaks the various device name allocations on
* failure. The caller takes care of them.
+ *
+ * *sb is const because this is also used to test options on the remount
+ * path, and we don't want this to have any side effects at remount time.
+ * Today this function does not change *sb, but just to future-proof...
*/
STATIC int
xfs_parseargs(
struct xfs_mount *mp,
char *options)
{
- struct super_block *sb = mp->m_super;
- char *this_char, *value;
+ const struct super_block *sb = mp->m_super;
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
int dsunit = 0;
int dswidth = 0;
int iosize = 0;
@@ -217,152 +226,152 @@ xfs_parseargs(
if (!options)
goto done;
- while ((this_char = strsep(&options, ",")) != NULL) {
- if (!*this_char)
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
continue;
- if ((value = strchr(this_char, '=')) != NULL)
- *value++ = 0;
- if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (kstrtoint(value, 10, &mp->m_logbufs))
- return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_logbufs:
+ if (match_int(args, &mp->m_logbufs))
return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
+ break;
+ case Opt_logbsize:
+ if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
return -EINVAL;
- }
- mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ break;
+ case Opt_logdev:
+ mp->m_logname = match_strdup(args);
if (!mp->m_logname)
return -ENOMEM;
- } else if (!strcmp(this_char, MNTOPT_MTPT)) {
- xfs_warn(mp, "%s option not allowed on this system",
- this_char);
+ break;
+ case Opt_mtpt:
+ xfs_warn(mp, "%s option not allowed on this system", p);
return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ case Opt_rtdev:
+ mp->m_rtname = match_strdup(args);
if (!mp->m_rtname)
return -ENOMEM;
- } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) ||
- !strcmp(this_char, MNTOPT_BIOSIZE)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (suffix_kstrtoint(value, 10, &iosize))
+ break;
+ case Opt_allocsize:
+ case Opt_biosize:
+ if (suffix_kstrtoint(args, 10, &iosize))
return -EINVAL;
iosizelog = ffs(iosize) - 1;
- } else if (!strcmp(this_char, MNTOPT_GRPID) ||
- !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+ break;
+ case Opt_grpid:
+ case Opt_bsdgroups:
mp->m_flags |= XFS_MOUNT_GRPID;
- } else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
- !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+ break;
+ case Opt_nogrpid:
+ case Opt_sysvgroups:
mp->m_flags &= ~XFS_MOUNT_GRPID;
- } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+ break;
+ case Opt_wsync:
mp->m_flags |= XFS_MOUNT_WSYNC;
- } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+ break;
+ case Opt_norecovery:
mp->m_flags |= XFS_MOUNT_NORECOVERY;
- } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+ break;
+ case Opt_noalign:
mp->m_flags |= XFS_MOUNT_NOALIGN;
- } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
+ break;
+ case Opt_swalloc:
mp->m_flags |= XFS_MOUNT_SWALLOC;
- } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (kstrtoint(value, 10, &dsunit))
- return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
+ break;
+ case Opt_sunit:
+ if (match_int(args, &dsunit))
return -EINVAL;
- }
- if (kstrtoint(value, 10, &dswidth))
+ break;
+ case Opt_swidth:
+ if (match_int(args, &dswidth))
return -EINVAL;
- } else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
+ break;
+ case Opt_inode32:
mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
+ break;
+ case Opt_inode64:
mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
- } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+ break;
+ case Opt_nouuid:
mp->m_flags |= XFS_MOUNT_NOUUID;
- } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+ break;
+ case Opt_barrier:
mp->m_flags |= XFS_MOUNT_BARRIER;
- } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
+ break;
+ case Opt_nobarrier:
mp->m_flags &= ~XFS_MOUNT_BARRIER;
- } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
+ break;
+ case Opt_ikeep:
mp->m_flags |= XFS_MOUNT_IKEEP;
- } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
+ break;
+ case Opt_noikeep:
mp->m_flags &= ~XFS_MOUNT_IKEEP;
- } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
+ break;
+ case Opt_largeio:
mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
- } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
+ break;
+ case Opt_nolargeio:
mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
- } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
+ break;
+ case Opt_attr2:
mp->m_flags |= XFS_MOUNT_ATTR2;
- } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
+ break;
+ case Opt_noattr2:
mp->m_flags &= ~XFS_MOUNT_ATTR2;
mp->m_flags |= XFS_MOUNT_NOATTR2;
- } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
+ break;
+ case Opt_filestreams:
mp->m_flags |= XFS_MOUNT_FILESTREAMS;
- } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
+ break;
+ case Opt_noquota:
mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
- } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
- !strcmp(this_char, MNTOPT_UQUOTA) ||
- !strcmp(this_char, MNTOPT_USRQUOTA)) {
+ break;
+ case Opt_quota:
+ case Opt_uquota:
+ case Opt_usrquota:
mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
XFS_UQUOTA_ENFD);
- } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
- !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+ break;
+ case Opt_qnoenforce:
+ case Opt_uqnoenforce:
mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_UQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
- !strcmp(this_char, MNTOPT_PRJQUOTA)) {
+ break;
+ case Opt_pquota:
+ case Opt_prjquota:
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
XFS_PQUOTA_ENFD);
- } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
+ break;
+ case Opt_pqnoenforce:
mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_PQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
- !strcmp(this_char, MNTOPT_GRPQUOTA)) {
+ case Opt_gquota:
+ case Opt_grpquota:
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
XFS_GQUOTA_ENFD);
- } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+ break;
+ case Opt_gqnoenforce:
mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
mp->m_qflags &= ~XFS_GQUOTA_ENFD;
- } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+ break;
+ case Opt_discard:
mp->m_flags |= XFS_MOUNT_DISCARD;
- } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+ break;
+ case Opt_nodiscard:
mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ break;
#ifdef CONFIG_FS_DAX
- } else if (!strcmp(this_char, MNTOPT_DAX)) {
+ case Opt_dax:
mp->m_flags |= XFS_MOUNT_DAX;
+ break;
#endif
- } else {
- xfs_warn(mp, "unknown mount option [%s].", this_char);
+ default:
+ xfs_warn(mp, "unknown mount option [%s].", p);
return -EINVAL;
}
}
@@ -461,25 +470,25 @@ xfs_showargs(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP },
- { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
- { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
- { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
- { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
- { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
- { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 },
- { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
- { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
- { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
- { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE },
- { XFS_MOUNT_DAX, "," MNTOPT_DAX },
+ { XFS_MOUNT_IKEEP, ",ikeep" },
+ { XFS_MOUNT_WSYNC, ",wsync" },
+ { XFS_MOUNT_NOALIGN, ",noalign" },
+ { XFS_MOUNT_SWALLOC, ",swalloc" },
+ { XFS_MOUNT_NOUUID, ",nouuid" },
+ { XFS_MOUNT_NORECOVERY, ",norecovery" },
+ { XFS_MOUNT_ATTR2, ",attr2" },
+ { XFS_MOUNT_FILESTREAMS, ",filestreams" },
+ { XFS_MOUNT_GRPID, ",grpid" },
+ { XFS_MOUNT_DISCARD, ",discard" },
+ { XFS_MOUNT_SMALL_INUMS, ",inode32" },
+ { XFS_MOUNT_DAX, ",dax" },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO },
- { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER },
- { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE },
+ { XFS_MOUNT_COMPAT_IOSIZE, ",largeio" },
+ { XFS_MOUNT_BARRIER, ",nobarrier" },
+ { XFS_MOUNT_SMALL_INUMS, ",inode64" },
{ 0, NULL }
};
struct proc_xfs_info *xfs_infop;
@@ -494,46 +503,46 @@ xfs_showargs(
}
if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
- seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
+ seq_printf(m, ",allocsize=%dk",
(int)(1 << mp->m_writeio_log) >> 10);
if (mp->m_logbufs > 0)
- seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
+ seq_printf(m, ",logbufs=%d", mp->m_logbufs);
if (mp->m_logbsize > 0)
- seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
+ seq_printf(m, ",logbsize=%dk", mp->m_logbsize >> 10);
if (mp->m_logname)
- seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
+ seq_show_option(m, "logdev", mp->m_logname);
if (mp->m_rtname)
- seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
+ seq_show_option(m, "rtdev", mp->m_rtname);
if (mp->m_dalign > 0)
- seq_printf(m, "," MNTOPT_SUNIT "=%d",
+ seq_printf(m, ",sunit=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_dalign));
if (mp->m_swidth > 0)
- seq_printf(m, "," MNTOPT_SWIDTH "=%d",
+ seq_printf(m, ",swidth=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
- seq_puts(m, "," MNTOPT_USRQUOTA);
+ seq_puts(m, ",usrquota");
else if (mp->m_qflags & XFS_UQUOTA_ACCT)
- seq_puts(m, "," MNTOPT_UQUOTANOENF);
+ seq_puts(m, ",uqnoenforce");
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
if (mp->m_qflags & XFS_PQUOTA_ENFD)
- seq_puts(m, "," MNTOPT_PRJQUOTA);
+ seq_puts(m, ",prjquota");
else
- seq_puts(m, "," MNTOPT_PQUOTANOENF);
+ seq_puts(m, ",pqnoenforce");
}
if (mp->m_qflags & XFS_GQUOTA_ACCT) {
if (mp->m_qflags & XFS_GQUOTA_ENFD)
- seq_puts(m, "," MNTOPT_GRPQUOTA);
+ seq_puts(m, ",grpquota");
else
- seq_puts(m, "," MNTOPT_GQUOTANOENF);
+ seq_puts(m, ",gqnoenforce");
}
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
- seq_puts(m, "," MNTOPT_NOQUOTA);
+ seq_puts(m, ",noquota");
return 0;
}
@@ -547,10 +556,10 @@ xfs_max_file_offset(
/* Figure out maximum filesize, on Linux this can depend on
* the filesystem blocksize (on 32 bit platforms).
* __block_write_begin does this in an [unsigned] long...
- * page->index << (PAGE_CACHE_SHIFT - bbits)
+ * page->index << (PAGE_SHIFT - bbits)
* So, for page sized blocks (4K on 32 bit platforms),
* this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
- * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+ * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
* but for smaller blocksizes it is less (bbits = log2 bsize).
* Note1: get_block_t takes a long (implicit cast from above)
* Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
@@ -561,10 +570,10 @@ xfs_max_file_offset(
#if BITS_PER_LONG == 32
# if defined(CONFIG_LBDAF)
ASSERT(sizeof(sector_t) == 8);
- pagefactor = PAGE_CACHE_SIZE;
+ pagefactor = PAGE_SIZE;
bitshift = BITS_PER_LONG;
# else
- pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+ pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);
# endif
#endif
@@ -572,23 +581,35 @@ xfs_max_file_offset(
}
/*
- * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
- * because in the growfs case, mp->m_sb.sb_agcount is not updated
- * yet to the potentially higher ag count.
+ * Set parameters for inode allocation heuristics, taking into account
+ * filesystem size and inode32/inode64 mount options; i.e. specifically
+ * whether or not XFS_MOUNT_SMALL_INUMS is set.
+ *
+ * Inode allocation patterns are altered only if inode32 is requested
+ * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large.
+ * If altered, XFS_MOUNT_32BITINODES is set as well.
+ *
+ * An agcount independent of that in the mount structure is provided
+ * because in the growfs case, mp->m_sb.sb_agcount is not yet updated
+ * to the potentially higher ag count.
+ *
+ * Returns the maximum AG index which may contain inodes.
*/
xfs_agnumber_t
-xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
+xfs_set_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agcount)
{
- xfs_agnumber_t index = 0;
+ xfs_agnumber_t index;
xfs_agnumber_t maxagi = 0;
xfs_sb_t *sbp = &mp->m_sb;
xfs_agnumber_t max_metadata;
xfs_agino_t agino;
xfs_ino_t ino;
- xfs_perag_t *pag;
- /* Calculate how much should be reserved for inodes to meet
- * the max inode percentage.
+ /*
+ * Calculate how much should be reserved for inodes to meet
+ * the max inode percentage. Used only for inode32.
*/
if (mp->m_maxicount) {
__uint64_t icount;
@@ -602,54 +623,48 @@ xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
max_metadata = agcount;
}
+ /* Get the last possible inode in the filesystem */
agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+ ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+ /*
+ * If user asked for no more than 32-bit inodes, and the fs is
+ * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter
+ * the allocator to accommodate the request.
+ */
+ if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
+ mp->m_flags |= XFS_MOUNT_32BITINODES;
+ else
+ mp->m_flags &= ~XFS_MOUNT_32BITINODES;
for (index = 0; index < agcount; index++) {
- ino = XFS_AGINO_TO_INO(mp, index, agino);
+ struct xfs_perag *pag;
- if (ino > XFS_MAXINUMBER_32) {
- pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 0;
- pag->pagf_metadata = 0;
- xfs_perag_put(pag);
- continue;
- }
+ ino = XFS_AGINO_TO_INO(mp, index, agino);
pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 1;
- maxagi++;
- if (index < max_metadata)
- pag->pagf_metadata = 1;
- xfs_perag_put(pag);
- }
- mp->m_flags |= (XFS_MOUNT_32BITINODES |
- XFS_MOUNT_SMALL_INUMS);
- return maxagi;
-}
-
-xfs_agnumber_t
-xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
-{
- xfs_agnumber_t index = 0;
-
- for (index = 0; index < agcount; index++) {
- struct xfs_perag *pag;
+ if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+ if (ino > XFS_MAXINUMBER_32) {
+ pag->pagi_inodeok = 0;
+ pag->pagf_metadata = 0;
+ } else {
+ pag->pagi_inodeok = 1;
+ maxagi++;
+ if (index < max_metadata)
+ pag->pagf_metadata = 1;
+ else
+ pag->pagf_metadata = 0;
+ }
+ } else {
+ pag->pagi_inodeok = 1;
+ pag->pagf_metadata = 0;
+ }
- pag = xfs_perag_get(mp, index);
- pag->pagi_inodeok = 1;
- pag->pagf_metadata = 0;
xfs_perag_put(pag);
}
- /* There is no need for lock protection on m_flags,
- * the rw_semaphore of the VFS superblock is locked
- * during mount/umount/remount operations, so this is
- * enough to avoid concurency on the m_flags field
- */
- mp->m_flags &= ~(XFS_MOUNT_32BITINODES |
- XFS_MOUNT_SMALL_INUMS);
- return index;
+ return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount;
}
STATIC int
@@ -838,17 +853,18 @@ xfs_init_mount_workqueues(
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
+ mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -922,7 +938,7 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- XFS_STATS_INC(vn_reclaim);
+ XFS_STATS_INC(ip->i_mount, vn_reclaim);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -983,8 +999,8 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- XFS_STATS_INC(vn_rele);
- XFS_STATS_INC(vn_remove);
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
xfs_inactive(ip);
}
@@ -1165,6 +1181,27 @@ xfs_quiesce_attr(
}
STATIC int
+xfs_test_remount_options(
+ struct super_block *sb,
+ struct xfs_mount *mp,
+ char *options)
+{
+ int error = 0;
+ struct xfs_mount *tmp_mp;
+
+ tmp_mp = kmem_zalloc(sizeof(*tmp_mp), KM_MAYFAIL);
+ if (!tmp_mp)
+ return -ENOMEM;
+
+ tmp_mp->m_super = sb;
+ error = xfs_parseargs(tmp_mp, options);
+ xfs_free_fsname(tmp_mp);
+ kfree(tmp_mp);
+
+ return error;
+}
+
+STATIC int
xfs_fs_remount(
struct super_block *sb,
int *flags,
@@ -1176,6 +1213,11 @@ xfs_fs_remount(
char *p;
int error;
+ /* First, check for complete junk; i.e. invalid options */
+ error = xfs_test_remount_options(sb, mp, options);
+ if (error)
+ return error;
+
sync_filesystem(sb);
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -1192,10 +1234,12 @@ xfs_fs_remount(
mp->m_flags &= ~XFS_MOUNT_BARRIER;
break;
case Opt_inode64:
- mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
break;
case Opt_inode32:
- mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount);
break;
default:
/*
@@ -1343,9 +1387,8 @@ xfs_finish_flags(
*/
if (xfs_sb_version_hascrc(&mp->m_sb) &&
(mp->m_flags & XFS_MOUNT_NOATTR2)) {
- xfs_warn(mp,
-"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
- MNTOPT_NOATTR2, MNTOPT_ATTR2);
+ xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
+ "attr2 is always enabled for V5 filesystems.");
return -EINVAL;
}
@@ -1474,9 +1517,16 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_workqueues;
+ /* Allocate stats memory before we do operations that might use it */
+ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!mp->m_stats.xs_stats) {
+ error = -ENOMEM;
+ goto out_destroy_counters;
+ }
+
error = xfs_readsb(mp, flags);
if (error)
- goto out_destroy_counters;
+ goto out_free_stats;
error = xfs_finish_flags(mp);
if (error)
@@ -1545,9 +1595,11 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
+ out_free_stats:
+ free_percpu(mp->m_stats.xs_stats);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
-out_destroy_workqueues:
+ out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
@@ -1574,6 +1626,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
@@ -1703,8 +1756,8 @@ xfs_init_zones(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+ KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
@@ -1806,6 +1859,8 @@ init_xfs_fs(void)
{
int error;
+ xfs_check_ondisk_structs();
+
printk(KERN_INFO XFS_VERSION_STRING " with "
XFS_BUILD_OPTIONS " enabled\n");
@@ -1838,19 +1893,32 @@ init_xfs_fs(void)
xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
if (!xfs_kset) {
error = -ENOMEM;
- goto out_sysctl_unregister;;
+ goto out_sysctl_unregister;
}
+ xfsstats.xs_kobj.kobject.kset = xfs_kset;
+
+ xfsstats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!xfsstats.xs_stats) {
+ error = -ENOMEM;
+ goto out_kset_unregister;
+ }
+
+ error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL,
+ "stats");
+ if (error)
+ goto out_free_stats;
+
#ifdef DEBUG
xfs_dbg_kobj.kobject.kset = xfs_kset;
error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
if (error)
- goto out_kset_unregister;
+ goto out_remove_stats_kobj;
#endif
error = xfs_qm_init();
if (error)
- goto out_remove_kobj;
+ goto out_remove_dbg_kobj;
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1859,11 +1927,15 @@ init_xfs_fs(void)
out_qm_exit:
xfs_qm_exit();
- out_remove_kobj:
+ out_remove_dbg_kobj:
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
- out_kset_unregister:
+ out_remove_stats_kobj:
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ out_free_stats:
+ free_percpu(xfsstats.xs_stats);
+ out_kset_unregister:
kset_unregister(xfs_kset);
out_sysctl_unregister:
xfs_sysctl_unregister();
@@ -1889,6 +1961,8 @@ exit_xfs_fs(void)
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ free_percpu(xfsstats.xs_stats);
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
@@ -1896,6 +1970,7 @@ exit_xfs_fs(void)
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
+ xfs_uuid_table_free();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 499058fea303..2dfb1ce4585f 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -65,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
-extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
-extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
+extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
+ xfs_agnumber_t agcount);
extern const struct export_operations xfs_export_operations;
extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 996481eeb491..b44284c1adda 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -178,7 +178,6 @@ xfs_symlink(
struct xfs_bmap_free free_list;
xfs_fsblock_t first_block;
bool unlock_dp_on_error = false;
- int committed;
xfs_fileoff_t first_fsb;
xfs_filblks_t fs_blocks;
int nmaps;
@@ -387,7 +386,7 @@ xfs_symlink(
xfs_trans_set_sync(tp);
}
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, NULL);
if (error)
goto out_bmap_cancel;
@@ -434,7 +433,6 @@ xfs_inactive_symlink_rmt(
struct xfs_inode *ip)
{
xfs_buf_t *bp;
- int committed;
int done;
int error;
xfs_fsblock_t first_block;
@@ -510,16 +508,10 @@ xfs_inactive_symlink_rmt(
/*
* Commit the first transaction. This logs the EFI and the inode.
*/
- error = xfs_bmap_finish(&tp, &free_list, &committed);
+ error = xfs_bmap_finish(&tp, &free_list, ip);
if (error)
goto error_bmap_cancel;
/*
- * The transaction must have been committed, since there were
- * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
- * The new tp has the extent freeing and EFDs.
- */
- ASSERT(committed);
- /*
* The first xact was committed, so add the inode to the new one.
* Mark it dirty so it will be logged and moved forward in the log as
* part of every commit.
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index a0c8067cea6f..aed74d3f8da9 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "xfs_error.h"
+#include "xfs_stats.h"
static struct ctl_table_header *xfs_table_header;
@@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler(
size_t *lenp,
loff_t *ppos)
{
- int c, ret, *valp = ctl->data;
- __uint32_t vn_active;
+ int ret, *valp = ctl->data;
ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
if (!ret && write && *valp) {
- xfs_notice(NULL, "Clearing xfsstats");
- for_each_possible_cpu(c) {
- preempt_disable();
- /* save vn_active, it's a universal truth! */
- vn_active = per_cpu(xfsstats, c).vn_active;
- memset(&per_cpu(xfsstats, c), 0,
- sizeof(struct xfsstats));
- per_cpu(xfsstats, c).vn_active = vn_active;
- preempt_enable();
- }
+ xfs_stats_clearall(xfsstats.xs_stats);
xfs_stats_clear = 0;
}
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index aa03670851d8..6ced4f143494 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -18,14 +18,19 @@
#include "xfs.h"
#include "xfs_sysfs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_stats.h"
+#include "xfs_mount.h"
struct xfs_sysfs_attr {
struct attribute attr;
- ssize_t (*show)(char *buf, void *data);
- ssize_t (*store)(const char *buf, size_t count, void *data);
+ ssize_t (*show)(struct kobject *kobject, char *buf);
+ ssize_t (*store)(struct kobject *kobject, const char *buf,
+ size_t count);
};
static inline struct xfs_sysfs_attr *
@@ -38,17 +43,102 @@ to_attr(struct attribute *attr)
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
#define XFS_SYSFS_ATTR_RO(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+#define XFS_SYSFS_ATTR_WO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
+STATIC ssize_t
+xfs_sysfs_object_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+}
+
+STATIC ssize_t
+xfs_sysfs_object_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0;
+}
+
+static const struct sysfs_ops xfs_sysfs_ops = {
+ .show = xfs_sysfs_object_show,
+ .store = xfs_sysfs_object_store,
+};
+
/*
- * xfs_mount kobject. This currently has no attributes and thus no need for show
- * and store helpers. The mp kobject serves as the per-mount parent object that
- * is identified by the fsname under sysfs.
+ * xfs_mount kobject. The mp kobject also serves as the per-mount parent object
+ * that is identified by the fsname under sysfs.
*/
+static inline struct xfs_mount *
+to_mp(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xfs_mount, m_kobj);
+}
+
+#ifdef DEBUG
+
+STATIC ssize_t
+fail_writes_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val == 1)
+ mp->m_fail_writes = true;
+ else if (val == 0)
+ mp->m_fail_writes = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+STATIC ssize_t
+fail_writes_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ struct xfs_mount *mp = to_mp(kobject);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0);
+}
+XFS_SYSFS_ATTR_RW(fail_writes);
+
+#endif /* DEBUG */
+
+static struct attribute *xfs_mp_attrs[] = {
+#ifdef DEBUG
+ ATTR_LIST(fail_writes),
+#endif
+ NULL,
+};
+
struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_mp_attrs,
};
#ifdef DEBUG
@@ -56,9 +146,9 @@ struct kobj_type xfs_mp_ktype = {
STATIC ssize_t
log_recovery_delay_store(
+ struct kobject *kobject,
const char *buf,
- size_t count,
- void *data)
+ size_t count)
{
int ret;
int val;
@@ -77,8 +167,8 @@ log_recovery_delay_store(
STATIC ssize_t
log_recovery_delay_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
}
@@ -89,52 +179,87 @@ static struct attribute *xfs_dbg_attrs[] = {
NULL,
};
+struct kobj_type xfs_dbg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
+/* stats */
+
+static inline struct xstats *
+to_xstats(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xstats, xs_kobj);
+}
+
STATIC ssize_t
-xfs_dbg_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+stats_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xstats *stats = to_xstats(kobject);
- return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+ return xfs_stats_format(stats->xs_stats, buf);
}
+XFS_SYSFS_ATTR_RO(stats);
STATIC ssize_t
-xfs_dbg_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+stats_clear_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ int ret;
+ int val;
+ struct xstats *stats = to_xstats(kobject);
- return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return -EINVAL;
+
+ xfs_stats_clearall(stats->xs_stats);
+ return count;
}
+XFS_SYSFS_ATTR_WO(stats_clear);
-static struct sysfs_ops xfs_dbg_ops = {
- .show = xfs_dbg_show,
- .store = xfs_dbg_store,
+static struct attribute *xfs_stats_attrs[] = {
+ ATTR_LIST(stats),
+ ATTR_LIST(stats_clear),
+ NULL,
};
-struct kobj_type xfs_dbg_ktype = {
+struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_dbg_ops,
- .default_attrs = xfs_dbg_attrs,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_stats_attrs,
};
-#endif /* DEBUG */
-
/* xlog */
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xlog, l_kobj);
+}
+
STATIC ssize_t
log_head_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
spin_lock(&log->l_icloglock);
cycle = log->l_curr_cycle;
@@ -147,12 +272,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn);
STATIC ssize_t
log_tail_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
@@ -161,12 +286,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn);
STATIC ssize_t
reserve_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
+
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -175,65 +301,64 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head);
STATIC ssize_t
write_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
}
XFS_SYSFS_ATTR_RO(write_grant_head);
-static struct attribute *xfs_log_attrs[] = {
- ATTR_LIST(log_head_lsn),
- ATTR_LIST(log_tail_lsn),
- ATTR_LIST(reserve_grant_head),
- ATTR_LIST(write_grant_head),
- NULL,
-};
-
-static inline struct xlog *
-to_xlog(struct kobject *kobject)
-{
- struct xfs_kobj *kobj = to_kobj(kobject);
- return container_of(kobj, struct xlog, l_kobj);
-}
-
+#ifdef DEBUG
STATIC ssize_t
-xfs_log_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+log_badcrc_factor_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xlog *log = to_xlog(kobject);
+ int ret;
+ uint32_t val;
- return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ log->l_badcrc_factor = val;
+
+ return count;
}
STATIC ssize_t
-xfs_log_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+log_badcrc_factor_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xlog *log = to_xlog(kobject);
- return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
+ return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor);
}
-static struct sysfs_ops xfs_log_ops = {
- .show = xfs_log_show,
- .store = xfs_log_store,
+XFS_SYSFS_ATTR_RW(log_badcrc_factor);
+#endif /* DEBUG */
+
+static struct attribute *xfs_log_attrs[] = {
+ ATTR_LIST(log_head_lsn),
+ ATTR_LIST(log_tail_lsn),
+ ATTR_LIST(reserve_grant_head),
+ ATTR_LIST(write_grant_head),
+#ifdef DEBUG
+ ATTR_LIST(log_badcrc_factor),
+#endif
+ NULL,
};
struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_log_ops,
+ .sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_log_attrs,
};
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 240eee35f342..be692e59938d 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -22,6 +22,7 @@
extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern struct kobj_type xfs_dbg_ktype; /* debug */
extern struct kobj_type xfs_log_ktype; /* xlog */
+extern struct kobj_type xfs_stats_ktype; /* stats */
static inline struct xfs_kobj *
to_kobj(struct kobject *kobject)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5ed36b1e04c1..c8d58426008e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
@@ -1221,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
+DECLARE_EVENT_CLASS(xfs_readpage_class,
+ TP_PROTO(struct inode *inode, int nr_pages),
+ TP_ARGS(inode, nr_pages),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, nr_pages)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->nr_pages = nr_pages;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->nr_pages)
+)
+
+#define DEFINE_READPAGE_EVENT(name) \
+DEFINE_EVENT(xfs_readpage_class, name, \
+ TP_PROTO(struct inode *inode, int nr_pages), \
+ TP_ARGS(inode, nr_pages))
+DEFINE_READPAGE_EVENT(xfs_vm_readpage);
+DEFINE_READPAGE_EVENT(xfs_vm_readpages);
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int type, struct xfs_bmbt_irec *irec),
@@ -1269,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
-DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1312,6 +1335,10 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
+DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index a0ab1dae9c31..20c53666cb4b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -930,9 +930,9 @@ __xfs_trans_commit(
*/
if (sync) {
error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
- XFS_STATS_INC(xs_trans_sync);
+ XFS_STATS_INC(mp, xs_trans_sync);
} else {
- XFS_STATS_INC(xs_trans_async);
+ XFS_STATS_INC(mp, xs_trans_async);
}
return error;
@@ -955,7 +955,7 @@ out_unreserve:
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
- XFS_STATS_INC(xs_trans_empty);
+ XFS_STATS_INC(mp, xs_trans_empty);
return error;
}
@@ -1028,6 +1028,8 @@ __xfs_trans_roll(
struct xfs_trans_res tres;
int error;
+ *committed = 0;
+
/*
* Ensure that the inode is always logged.
*/
@@ -1082,6 +1084,6 @@ xfs_trans_roll(
struct xfs_trans **tpp,
struct xfs_inode *dp)
{
- int committed = 0;
+ int committed;
return __xfs_trans_roll(tpp, dp, &committed);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 4643070d7cae..e7c49cf43fbc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -133,7 +133,6 @@ typedef struct xfs_trans {
* XFS transaction mechanism exported interfaces that are
* actually macros.
*/
-#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
#if defined(DEBUG) || defined(XFS_WARN)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098cf490189..d6c9c3e9e02b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -349,7 +349,7 @@ xfsaild_push(
xfs_ail_min_lsn(ailp))) {
ailp->xa_log_flush = 0;
- XFS_STATS_INC(xs_push_ail_flush);
+ XFS_STATS_INC(mp, xs_push_ail_flush);
xfs_log_force(mp, XFS_LOG_SYNC);
}
@@ -371,7 +371,7 @@ xfsaild_push(
goto out_done;
}
- XFS_STATS_INC(xs_push_ail);
+ XFS_STATS_INC(mp, xs_push_ail);
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
@@ -385,7 +385,7 @@ xfsaild_push(
lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
- XFS_STATS_INC(xs_push_ail_success);
+ XFS_STATS_INC(mp, xs_push_ail_success);
trace_xfs_ail_push(lip);
ailp->xa_last_pushed_lsn = lsn;
@@ -403,7 +403,7 @@ xfsaild_push(
* re-try the flushing relatively soon if most of the
* AIL is beeing flushed.
*/
- XFS_STATS_INC(xs_push_ail_flushing);
+ XFS_STATS_INC(mp, xs_push_ail_flushing);
trace_xfs_ail_flushing(lip);
flushing++;
@@ -411,14 +411,14 @@ xfsaild_push(
break;
case XFS_ITEM_PINNED:
- XFS_STATS_INC(xs_push_ail_pinned);
+ XFS_STATS_INC(mp, xs_push_ail_pinned);
trace_xfs_ail_pinned(lip);
stuck++;
ailp->xa_log_flush++;
break;
case XFS_ITEM_LOCKED:
- XFS_STATS_INC(xs_push_ail_locked);
+ XFS_STATS_INC(mp, xs_push_ail_locked);
trace_xfs_ail_locked(lip);
stuck++;
@@ -497,6 +497,7 @@ xfsaild(
long tout = 0; /* milliseconds */
current->flags |= PF_MEMALLOC;
+ set_freezable();
while (!kthread_should_stop()) {
if (tout && tout <= 20)
@@ -519,14 +520,14 @@ xfsaild(
if (!xfs_ail_min(ailp) &&
ailp->xa_target == ailp->xa_target_prev) {
spin_unlock(&ailp->xa_lock);
- schedule();
+ freezable_schedule();
tout = 0;
continue;
}
spin_unlock(&ailp->xa_lock);
if (tout)
- schedule_timeout(msecs_to_jiffies(tout));
+ freezable_schedule_timeout(msecs_to_jiffies(tout));
__set_current_state(TASK_RUNNING);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 75798412859a..8ee29ca132dc 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -155,7 +155,7 @@ xfs_trans_get_buf_map(
ASSERT(xfs_buf_islocked(bp));
if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
xfs_buf_stale(bp);
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
}
ASSERT(bp->b_transp == tp);
@@ -518,7 +518,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
* inside the b_bdstrat callback so that this won't get written to
* disk.
*/
- XFS_BUF_DONE(bp);
+ bp->b_flags |= XBF_DONE;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
bp->b_iodone = xfs_buf_iodone_callbacks;
@@ -534,8 +534,8 @@ xfs_trans_log_buf(xfs_trans_t *tp,
*/
if (bip->bli_flags & XFS_BLI_STALE) {
bip->bli_flags &= ~XFS_BLI_STALE;
- ASSERT(XFS_BUF_ISSTALE(bp));
- XFS_BUF_UNSTALE(bp);
+ ASSERT(bp->b_flags & XBF_STALE);
+ bp->b_flags &= ~XBF_STALE;
bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
}
@@ -600,7 +600,7 @@ xfs_trans_binval(
* If the buffer is already invalidated, then
* just return.
*/
- ASSERT(XFS_BUF_ISSTALE(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index ce78534a047e..c3d547211d16 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -572,12 +572,16 @@ xfs_quota_warn(
struct xfs_dquot *dqp,
int type)
{
- /* no warnings for project quotas - we just return ENOSPC later */
+ enum quota_type qtype;
+
if (dqp->dq_flags & XFS_DQ_PROJ)
- return;
- quota_send_warning(make_kqid(&init_user_ns,
- (dqp->dq_flags & XFS_DQ_USER) ?
- USRQUOTA : GRPQUOTA,
+ qtype = PRJQUOTA;
+ else if (dqp->dq_flags & XFS_DQ_USER)
+ qtype = USRQUOTA;
+ else
+ qtype = GRPQUOTA;
+
+ quota_send_warning(make_kqid(&init_user_ns, qtype,
be32_to_cpu(dqp->q_core.d_id)),
mp->m_super->s_dev, type);
}
@@ -605,17 +609,20 @@ xfs_trans_dqresv(
xfs_qcnt_t total_count;
xfs_qcnt_t *resbcountp;
xfs_quotainfo_t *q = mp->m_quotainfo;
+ struct xfs_def_quota *defq;
xfs_dqlock(dqp);
+ defq = xfs_get_defquota(dqp, q);
+
if (flags & XFS_TRANS_DQ_RES_BLKS) {
hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
if (!hardlimit)
- hardlimit = q->qi_bhardlimit;
+ hardlimit = defq->bhardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
if (!softlimit)
- softlimit = q->qi_bsoftlimit;
+ softlimit = defq->bsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_btimer);
warns = be16_to_cpu(dqp->q_core.d_bwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
@@ -624,10 +631,10 @@ xfs_trans_dqresv(
ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
if (!hardlimit)
- hardlimit = q->qi_rtbhardlimit;
+ hardlimit = defq->rtbhardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
if (!softlimit)
- softlimit = q->qi_rtbsoftlimit;
+ softlimit = defq->rtbsoftlimit;
timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
@@ -668,10 +675,10 @@ xfs_trans_dqresv(
warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
if (!hardlimit)
- hardlimit = q->qi_ihardlimit;
+ hardlimit = defq->ihardlimit;
softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
if (!softlimit)
- softlimit = q->qi_isoftlimit;
+ softlimit = defq->isoftlimit;
if (hardlimit && total_count > hardlimit) {
xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 17280cd71934..11a3af08b5c7 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -75,18 +75,10 @@ xfs_trans_ichgtime(
tv = current_fs_time(inode->i_sb);
- if ((flags & XFS_ICHGTIME_MOD) &&
- !timespec_equal(&inode->i_mtime, &tv)) {
+ if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
- ip->i_d.di_mtime.t_sec = tv.tv_sec;
- ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
- }
- if ((flags & XFS_ICHGTIME_CHG) &&
- !timespec_equal(&inode->i_ctime, &tv)) {
+ if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
- ip->i_d.di_ctime.t_sec = tv.tv_sec;
- ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
- }
}
/*
@@ -108,6 +100,15 @@ xfs_trans_log_inode(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
+ * Record the specific change for fdatasync optimisation. This
+ * allows fdatasync to skip log forces for inodes that are only
+ * timestamp dirty. We do this before the change count so that
+ * the core being logged in this case does not impact on fdatasync
+ * behaviour.
+ */
+ ip->i_itemp->ili_fsync_fields |= flags;
+
+ /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. We don't use
* inode_inc_version() because there is no need for extra locking around
@@ -116,7 +117,7 @@ xfs_trans_log_inode(
*/
if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
IS_I_VERSION(VFS_I(ip))) {
- ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+ VFS_I(ip)->i_version++;
flags |= XFS_ILOG_CORE;
}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c036815183cb..110f1d7d86b0 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -32,15 +32,13 @@
static int
-xfs_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int xflags)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *value, size_t size)
{
+ int xflags = handler->flags;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error, asize = size;
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
/* Convert Linux syscall to XFS internal ATTR flags */
if (!size) {
xflags |= ATTR_KERNOVAL;
@@ -53,14 +51,35 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
return asize;
}
-static int
-xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int xflags)
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name,
+ int xflags)
{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ /*
+ * Invalidate any cached ACLs if the user has bypassed the ACL
+ * interface. We don't validate the content whatsoever so it is caller
+ * responsibility to provide data in valid format and ensure i_mode is
+ * consistent.
+ */
+ if (xflags & ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+#endif
+ }
+}
- if (strcmp(name, "") == 0)
- return -EINVAL;
+static int
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *value, size_t size, int flags)
+{
+ int xflags = handler->flags;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
/* Convert Linux syscall to XFS internal ATTR flags */
if (flags & XATTR_CREATE)
@@ -70,8 +89,12 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
if (!value)
return xfs_attr_remove(ip, (unsigned char *)name, xflags);
- return xfs_attr_set(ip, (unsigned char *)name,
+ error = xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
+ if (!error)
+ xfs_forget_acl(d_inode(dentry), name, xflags);
+
+ return error;
}
static const struct xattr_handler xfs_xattr_user_handler = {
@@ -106,47 +129,19 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
NULL
};
-static unsigned int xfs_xattr_prefix_len(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return sizeof("security");
- else if (flags & XFS_ATTR_ROOT)
- return sizeof("trusted");
- else
- return sizeof("user");
-}
-
-static const char *xfs_xattr_prefix(int flags)
-{
- if (flags & XFS_ATTR_SECURE)
- return xfs_xattr_security_handler.prefix;
- else if (flags & XFS_ATTR_ROOT)
- return xfs_xattr_trusted_handler.prefix;
- else
- return xfs_xattr_user_handler.prefix;
-}
-
static int
-xfs_xattr_put_listent(
+__xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
- int flags,
- unsigned char *name,
- int namelen,
- int valuelen,
- unsigned char *value)
+ char *prefix,
+ int prefix_len,
+ unsigned char *name,
+ int namelen)
{
- unsigned int prefix_len = xfs_xattr_prefix_len(flags);
char *offset;
int arraytop;
- ASSERT(context->count >= 0);
-
- /*
- * Only show root namespace entries if we are actually allowed to
- * see them.
- */
- if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
- return 0;
+ if (!context->alist)
+ goto compute_size;
arraytop = context->count + prefix_len + namelen + 1;
if (arraytop > context->firstu) {
@@ -154,17 +149,19 @@ xfs_xattr_put_listent(
return 1;
}
offset = (char *)context->alist + context->count;
- strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+ strncpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
*offset = '\0';
+
+compute_size:
context->count += prefix_len + namelen + 1;
return 0;
}
static int
-xfs_xattr_put_listent_sizes(
+xfs_xattr_put_listent(
struct xfs_attr_list_context *context,
int flags,
unsigned char *name,
@@ -172,24 +169,55 @@ xfs_xattr_put_listent_sizes(
int valuelen,
unsigned char *value)
{
- context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
- return 0;
-}
+ char *prefix;
+ int prefix_len;
-static int
-list_one_attr(const char *name, const size_t len, void *data,
- size_t size, ssize_t *result)
-{
- char *p = data + *result;
+ ASSERT(context->count >= 0);
- *result += len;
- if (!size)
- return 0;
- if (*result > size)
- return -ERANGE;
+ if (flags & XFS_ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (namelen == SGI_ACL_FILE_SIZE &&
+ strncmp(name, SGI_ACL_FILE,
+ SGI_ACL_FILE_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_ACCESS,
+ strlen(XATTR_POSIX_ACL_ACCESS));
+ if (ret)
+ return ret;
+ } else if (namelen == SGI_ACL_DEFAULT_SIZE &&
+ strncmp(name, SGI_ACL_DEFAULT,
+ SGI_ACL_DEFAULT_SIZE) == 0) {
+ int ret = __xfs_xattr_put_listent(
+ context, XATTR_SYSTEM_PREFIX,
+ XATTR_SYSTEM_PREFIX_LEN,
+ XATTR_POSIX_ACL_DEFAULT,
+ strlen(XATTR_POSIX_ACL_DEFAULT));
+ if (ret)
+ return ret;
+ }
+#endif
- strcpy(p, name);
- return 0;
+ /*
+ * Only show root namespace entries if we are actually allowed to
+ * see them.
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ prefix = XATTR_TRUSTED_PREFIX;
+ prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ } else if (flags & XFS_ATTR_SECURE) {
+ prefix = XATTR_SECURITY_PREFIX;
+ prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ } else {
+ prefix = XATTR_USER_PREFIX;
+ prefix_len = XATTR_USER_PREFIX_LEN;
+ }
+
+ return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+ namelen);
}
ssize_t
@@ -198,7 +226,6 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
struct xfs_attr_list_context context;
struct attrlist_cursor_kern cursor = { 0 };
struct inode *inode = d_inode(dentry);
- int error;
/*
* First read the regular on-disk attributes.
@@ -207,37 +234,14 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
context.dp = XFS_I(inode);
context.cursor = &cursor;
context.resynch = 1;
- context.alist = data;
+ context.alist = size ? data : NULL;
context.bufsize = size;
context.firstu = context.bufsize;
-
- if (size)
- context.put_listent = xfs_xattr_put_listent;
- else
- context.put_listent = xfs_xattr_put_listent_sizes;
+ context.put_listent = xfs_xattr_put_listent;
xfs_attr_list_int(&context);
if (context.count < 0)
return -ERANGE;
- /*
- * Then add the two synthetic ACL attributes.
- */
- if (posix_acl_access_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
- strlen(POSIX_ACL_XATTR_ACCESS) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
- if (posix_acl_default_exists(inode)) {
- error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
- strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
- data, size, &context.count);
- if (error)
- return error;
- }
-
return context.count;
}